# Financial Sentiment Analysis with FastText

## **Financial Sentiment Analysis with bigrams**

In [1]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import fasttext
import nltk

nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to C:\Users\N I T R O
[nltk_data]     V\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\N I T R O
[nltk_data]     V\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### checking the dataset for its label and content

In [2]:
df = pd.read_csv(r"D:\group_assignment\financial_sentiment_analysis.csv")
df.head()

Unnamed: 0,Sentence,Sentiment
0,The GeoSolutions technology will leverage Bene...,positive
1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",negative
2,"For the last quarter of 2010 , Componenta 's n...",positive
3,According to the Finnish-Russian Chamber of Co...,neutral
4,The Swedish buyout firm has sold its remaining...,neutral


### preprocessing the text

In [3]:
# preprocessing the text
df.columns = ['content', 'label']
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'[^a-z\s]', '', text)  # Remove special characters
    tokens = word_tokenize(text)  # Tokenize text
    tokens = [word for word in tokens if word not in stop_words]  # Remove stopwords
    return ' '.join(tokens)

# applying preprocessing to the 'content' column
df['content'] = df['content'].apply(preprocess_text)
df.head()

Unnamed: 0,content,label
0,geosolutions technology leverage benefon gps s...,positive
1,esi lows bk real possibility,negative
2,last quarter componenta net sales doubled eurm...,positive
3,according finnishrussian chamber commerce majo...,neutral
4,swedish buyout firm sold remaining percent sta...,neutral


### spliting dataset into training and testing dataset

In [4]:
# spliting the dataset into train and test sets (80% train, 20% test)
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['label'])

# saving train and test sets in a format suitable for FastText
train_df['label'] = '__label__' + train_df['label']  # FastText requires labels to start with __label__
test_df['label'] = '__label__' + test_df['label']


### saving the data in FastText format

In [5]:
# saving train and test sets in a format suitable for FastText
train_df['label'] = '__label__' + train_df['label']  # FastText requires labels to start with '__label__'
test_df['label'] = '__label__' + test_df['label']

train_df[['label', 'content']].to_csv('financial_train_fasttext.txt', index=False, sep=' ', header=None)
test_df[['label', 'content']].to_csv('financial_test_fasttext.txt', index=False, sep=' ', header=None)

### Train a FastText model

In [6]:
# training the FastText model
model = fasttext.train_supervised(input='financial_train_fasttext.txt', lr=1.0, epoch=25, wordNgrams=2)
model.save_model('financial_sentiment_model.bin')

### evaluating fasttext model

In [7]:
# testing the model on the test set
test_result = model.test('financial_test_fasttext.txt')
print(f'Number of examples: {test_result[0]}')
print(f'Precision (Accuracy): {test_result[1]}')
print(f'Recall: {test_result[2]}')

Number of examples: 1169
Precision (Accuracy): 0.6467065868263473
Recall: 0.6467065868263473


### checking prediction in new sentences

In [8]:
example_sentence = 'The company reported a large growth in revenue this quarter.'
preprocessed_sentence = preprocess_text(example_sentence)
prediction = model.predict(preprocessed_sentence)
print(f'Prediction for "{example_sentence}": {prediction}')

Prediction for "The company reported a large growth in revenue this quarter.": (('__label____label__neutral',), array([0.87928021]))


## **Financial Sentiment Analysis without bigrams**

In [9]:
from nltk.stem import WordNetLemmatizer

In [10]:
nltk.download('wordnet')

df = df.dropna(subset=['label','content'])

[nltk_data] Downloading package wordnet to C:\Users\N I T R O
[nltk_data]     V\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### preprocessing the text

In [11]:
# preprocessing the text
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """
    Preprocesses the text by:
    - Lowercasing
    - Removing non-alphabetic characters
    - Tokenizing
    - Removing stopwords
    - Lemmatizing
    """
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'[^a-z\s]', '', text)  # Remove special characters
    tokens = word_tokenize(text)  # Tokenize text
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]  # Lemmatize and remove stopwords
    return ' '.join(tokens)

# applying preprocessing to the 'content' column
df['cleaned_content'] = df['content'].apply(preprocess_text)
# removing empty rows after preprocessing
df = df[df['cleaned_content'].str.strip() != '']

df.head()


Unnamed: 0,content,label,cleaned_content
0,geosolutions technology leverage benefon gps s...,positive,geosolutions technology leverage benefon gps s...
1,esi lows bk real possibility,negative,esi low bk real possibility
2,last quarter componenta net sales doubled eurm...,positive,last quarter componenta net sale doubled eurm ...
3,according finnishrussian chamber commerce majo...,neutral,according finnishrussian chamber commerce majo...
4,swedish buyout firm sold remaining percent sta...,neutral,swedish buyout firm sold remaining percent sta...


In [12]:
# removing empty rows after preprocessing
df = df[df['cleaned_content'].str.strip() != '']

# formating the labels to match FastText's required format
def format_labels(label):
    return f"__label__{label}"

df['formatted_labels'] = df['label'].apply(format_labels)


### spliting dataset into training and testing dataset

In [13]:
# spliting the dataset into training and test sets (80% train, 20% test)
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['label'])
# saving the training and test data in FastText format
with open("financial_train_fasttext.txt", "w") as f:
    for label, content in zip(train_df['formatted_labels'], train_df['cleaned_content']):
        f.write(f"{label} {content}\n")

with open("financial_test_fasttext.txt", "w") as f:
    for label, content in zip(test_df['formatted_labels'], test_df['cleaned_content']):
        f.write(f"{label} {content}\n")

# checking if the training file has valid content
with open("financial_train_fasttext.txt", "r") as f:
    for i in range(5):
        print(f.readline())

__label__negative national conciliator juhani salonius met party wednesday said far apart view propose mediation

__label__neutral rad chart flashing oversold

__label__neutral furthermore company sell warehouse office building loudeac saint marcel lease new joint premise operation morvillars

__label__positive adp news feb finnish solution provider affecto oyj hel afev said today net profit rose eur million usd eur million

__label__positive aim increase sale least one fifth



### training fasttext model

In [14]:
# training the FastText model with tuned parameters
model = fasttext.train_supervised(
    input="financial_train_fasttext.txt", 
    dim=10,  # Vector dimension
    epoch=100,  # More epochs for better training
    lr=0.1,  # Adjusted learning rate
    loss='softmax'  # Use softmax for better classification
)

model.save_model("financial_sentiment_model.bin")

### evaluating fastext model

In [15]:
# testing the FastText model
result = model.test("financial_test_fasttext.txt")
print(f"Number of examples: {result[0]}")
print(f"Precision (Accuracy): {result[1]}")
print(f"Recall: {result[2]}")

# printing accuracy
accuracy = result[1]  # Precision is the accuracy
print(f"Test Accuracy: {accuracy:.4f}")

Number of examples: 1169
Precision (Accuracy): 0.6449957228400343
Recall: 0.6449957228400343
Test Accuracy: 0.6450


### checking prediction on new sentence

In [16]:
example_sentence = "The company's stock price surged after the announcement."
preprocessed_sentence = preprocess_text(example_sentence)
prediction = model.predict(preprocessed_sentence)
print(f"Prediction for '{example_sentence}': {prediction}")


Prediction for 'The company's stock price surged after the announcement.': (('__label__positive',), array([0.99748534]))
