## About Dataset
Data
This dataset is intended to advance financial sentiment analysis research. It combines two datasets (FiQA and Financial PhraseBank) into a single, easy-to-use CSV file of financial sentences, each paired with a sentiment label.

Citations
Malo, Pekka, et al. "Good debt or bad debt: Detecting semantic orientations in economic texts." Journal of the Association for Information Science and Technology 65.4 (2014): 782-796.

In [189]:
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
import pandas as pd
import nltk
import re
import numpy as np
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer,PorterStemmer
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.pyplot import figure
from collections import Counter, defaultdict

from datasets import Dataset
import torch
import transformers

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report,precision_score,accuracy_score,f1_score

from transformers import BertTokenizer, Trainer, BertForSequenceClassification, TrainingArguments, DistilBertTokenizerFast,DistilBertForSequenceClassification

import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): a blanket 'ignore' also hides deprecation and convergence
# warnings; consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

# Fetch the NLTK resources used by this notebook. The `stopwords` corpus is
# read by preprocess_text; the other packages are presumably needed by the
# lemmatizer/tokenizer imports above. The final call's return value is what
# the cell displays.
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('omw-1.4')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/vasundharauniyal/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/vasundharauniyal/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/vasundharauniyal/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/vasundharauniyal/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [11]:
## Attention mechanism models - bert 
## BOW - idf , cdf
## tokenization

In [190]:
# Read the labelled sentences from a CSV expected in the working directory
# (presumably the combined FiQA + Financial PhraseBank file described above).
df = pd.read_csv('Sentimental.csv')
# Bare expression: renders the frame as the cell output.
df

Unnamed: 0,Sentence,Sentiment
0,The GeoSolutions technology will leverage Bene...,positive
1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",negative
2,"For the last quarter of 2010 , Componenta 's n...",positive
3,According to the Finnish-Russian Chamber of Co...,neutral
4,The Swedish buyout firm has sold its remaining...,neutral
...,...,...
5837,RISING costs have forced packaging producer Hu...,negative
5838,Nordic Walking was first used as a summer trai...,neutral
5839,"According shipping company Viking Line , the E...",neutral
5840,"In the building and home improvement trade , s...",neutral


In [191]:
# Compiled once at definition time so per-row calls reuse the same patterns.
_HTML_TAG_RE = re.compile('<.*?>')
_URL_RE = re.compile(r'http\S+')
_DIGITS_RE = re.compile('[0-9]+')
_WORD_RE = re.compile(r'\w+')

# Lazily filled cache: the NLTK corpus is read from disk at most once.
_DEFAULT_STOP_WORDS = {}


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    if 'english' not in _DEFAULT_STOP_WORDS:
        _DEFAULT_STOP_WORDS['english'] = frozenset(stopwords.words('english'))
    return _DEFAULT_STOP_WORDS['english']


def preprocess_text(text, stop_words=None):
    """Normalize one sentence into a space-separated string of content words.

    Steps, in order: cast to ``str`` and lowercase, drop the literal
    ``{html}`` marker, strip ``<...>`` tags, strip ``http...`` URLs, strip
    digit runs, split into ``\\w+`` tokens, then keep tokens longer than two
    characters that are not stop words.

    No stemming or lemmatization is applied; tokens are returned in their
    lowercased surface form.

    Parameters
    ----------
    text : object
        The raw sentence. Non-string values are converted with ``str()``
        (so a missing value such as ``None`` becomes the token ``"none"``).
    stop_words : collection of str, optional
        Words to discard. Defaults to NLTK's English stop-word list, which
        requires the ``stopwords`` corpus to be downloaded.

    Returns
    -------
    str
        The surviving tokens joined by single spaces; ``""`` if none survive.
    """
    if stop_words is None:
        stop_words = _english_stop_words()

    lowered = str(text).lower().replace('{html}', "")
    without_tags = _HTML_TAG_RE.sub('', lowered)
    without_urls = _URL_RE.sub('', without_tags)
    without_digits = _DIGITS_RE.sub('', without_urls)

    # Same tokens as RegexpTokenizer(r'\w+').tokenize(...): all \w+ matches.
    tokens = _WORD_RE.findall(without_digits)
    kept = [tok for tok in tokens if len(tok) > 2 and tok not in stop_words]
    return " ".join(kept)

# Clean every sentence and store the result next to the original text.
cleaned_sentences = df['Sentence'].apply(preprocess_text)
df['Cleaned_text'] = cleaned_sentences

# Preview the raw and cleaned text side by side.
df[['Sentence', 'Cleaned_text']].head()

Unnamed: 0,Sentence,Cleaned_text
0,The GeoSolutions technology will leverage Bene...,geosolutions technology leverage benefon gps s...
1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",esi lows real possibility
2,"For the last quarter of 2010 , Componenta 's n...",last quarter componenta net sales doubled eurm...
3,According to the Finnish-Russian Chamber of Co...,according finnish russian chamber commerce maj...
4,The Swedish buyout firm has sold its remaining...,swedish buyout firm sold remaining percent sta...


In [192]:
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import words

In [193]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [194]:
# Build a dense TF-IDF matrix restricted to 500 vocabulary terms.
# NOTE(review): the vectorizer is fitted on all rows before the train/test
# split, so vocabulary and IDF statistics from the test rows leak into the
# features.
vectorizer = TfidfVectorizer(max_features=500)
tfidf_sparse = vectorizer.fit_transform(df['Cleaned_text'])
X = tfidf_sparse.toarray()
y = df['Sentiment']

# Show which terms made it into the vocabulary.
print(vectorizer.get_feature_names_out())

['aapl' 'according' 'acquired' 'acquisition' 'activities' 'added'
 'addition' 'adp' 'afx' 'ago' 'agreed' 'agreement' 'ahlstrom' 'alma'
 'already' 'also' 'america' 'amount' 'amounted' 'analyst' 'announced'
 'annual' 'another' 'applications' 'approximately' 'april' 'area' 'areas'
 'around' 'aspo' 'aspocomp' 'astrazeneca' 'august' 'available' 'awarded'
 'back' 'baltic' 'bank' 'barclays' 'base' 'based' 'basware' 'beer' 'bid'
 'billion' 'board' 'brand' 'breakout' 'building' 'built' 'business' 'buy'
 'calls' 'capacity' 'capital' 'capman' 'cargotec' 'cash' 'cent' 'center'
 'ceo' 'chain' 'chairman' 'chief' 'china' 'city' 'close' 'closed' 'com'
 'combined' 'commercial' 'communications' 'companies' 'company' 'compared'
 'completed' 'componenta' 'construction' 'continue' 'continuing'
 'contract' 'cooperation' 'corp' 'corporation' 'corresponding' 'cost'
 'costs' 'could' 'countries' 'credit' 'current' 'currently' 'customer'
 'customers' 'cut' 'daily' 'data' 'day' 'deal' 'december' 'decided'
 'decis

In [195]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [196]:
# Report how many rows landed in each partition of the split.
# The first label previously read "Training test size", which mislabelled the training set.
print(f' Training set size:{X_train.shape[0]}') # 80%
print(f' Test set size:{X_test.shape[0]}') #20%

 Training test size:4673
 Test set size:1169


In [197]:
# Applying logistic regression.

In [198]:
from sklearn.linear_model import LogisticRegression

In [199]:
# Logistic-regression baseline. max_iter=1000 allows the solver more
# iterations — presumably to avoid convergence warnings; confirm if tuning.
model = LogisticRegression(max_iter = 1000)

In [200]:
# NOTE(review): this repeats the earlier split of X and y with the same
# arguments (test_size=0.2, random_state=42), so it reproduces the same
# partitions; the cell is redundant.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [201]:
# Train the logistic-regression model on the training partition of the TF-IDF matrix.
model.fit(X_train,y_train)

In [202]:
# Predict sentiment labels for the held-out rows.
y_pred = model.predict(X_test)

In [203]:
y_pred

array(['neutral', 'positive', 'negative', ..., 'neutral', 'neutral',
       'positive'], dtype=object)

In [204]:
# Overall accuracy first, then the per-class precision/recall/F1 table.
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

report = classification_report(y_test, y_pred)
print(report)

Accuracy: 0.66
              precision    recall  f1-score   support

    negative       0.44      0.18      0.26       175
     neutral       0.68      0.85      0.76       622
    positive       0.67      0.57      0.62       372

    accuracy                           0.66      1169
   macro avg       0.60      0.54      0.54      1169
weighted avg       0.64      0.66      0.64      1169



In [136]:
import joblib

In [205]:
from sklearn.svm import LinearSVC

In [206]:
# Split the cleaned text itself (not a precomputed matrix) so that the
# vectorizer can live inside the pipeline. This rebinds X_train/X_test/
# y_train/y_test from the earlier TF-IDF split.
X_train, X_test, y_train, y_test = train_test_split(
    df['Cleaned_text'],
    df['Sentiment'],
    test_size=0.2,
    random_state=0,
)

In [207]:
X_train.shape

(4673,)

In [208]:
X_test.shape

(1169,)

In [209]:
# Two-stage pipeline: TF-IDF vectorization followed by a linear SVM.
clf = Pipeline(
    [
        ('tfidf', TfidfVectorizer()),
        ('clf', LinearSVC()),
    ]
)

In [210]:
# Fit the pipeline on the training text; the TF-IDF step is fitted on these rows only.
clf.fit(X_train, y_train)

In [211]:
# Predict labels for the held-out text; this rebinds y_pred from the logistic-regression run.
y_pred = clf.predict(X_test)

In [212]:
y_pred

array(['positive', 'neutral', 'positive', ..., 'positive', 'positive',
       'positive'], dtype=object)

In [213]:
# Per-class precision/recall/F1 for the TF-IDF + LinearSVC pipeline.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

    negative       0.33      0.22      0.26       192
     neutral       0.71      0.80      0.75       643
    positive       0.71      0.68      0.70       334

    accuracy                           0.67      1169
   macro avg       0.58      0.57      0.57      1169
weighted avg       0.65      0.67      0.66      1169



In [224]:
# Persist the fitted pipeline, then load it back to confirm the file is usable.
model_path = 'Sentimental_analysis.pkl'
joblib.dump(clf, model_path)
loaded_model = joblib.load(model_path)

In [227]:
# Smoke-test the reloaded pipeline on a hand-written sentence.
# NOTE(review): the pipeline was trained on text passed through
# preprocess_text, but this input is raw — consider cleaning it the same way.
loaded_model.predict(['Wow, this is amazing lesson'])

array(['positive'], dtype=object)

In [228]:
loaded_model.predict(["i am sad"]) 

array(['negative'], dtype=object)

In [217]:
import pickle

# Serialize the fitted pipeline with pickle. The `with` block closes the file
# handle (and flushes the write) even if pickling raises.
with open('sentiment_analysis.pkl', 'wb') as model_file:
    pickle.dump(clf, model_file)