In [1]:
import pandas as pd

In [2]:
# The CSV is read with header=None, so the column labels are supplied here.
column_names = ['sentiment', 'text']

# Load the raw data; the file is decoded as latin1.
df = pd.read_csv(
    r'/content/drive/MyDrive/Colab Notebooks/NLP/SENTIMENT_ANALYSIS/Finance_sentiment.csv',
    names=column_names,
    header=None,
    encoding='latin1',
)

In [3]:
# Preview the first rows to check that both columns were parsed as intended.
df.head()

Unnamed: 0,sentiment,text
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...
3,positive,With the new production plant the company woul...
4,positive,According to the company 's updated strategy f...


In [4]:
# Report row count, column dtypes and non-null counts for the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4846 entries, 0 to 4845
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   sentiment  4846 non-null   object
 1   text       4846 non-null   object
dtypes: object(2)
memory usage: 75.8+ KB


In [5]:
# Summary statistics per column (count / unique / top / freq for these object columns).
df.describe()

Unnamed: 0,sentiment,text
count,4846,4846
unique,3,4838
top,neutral,TELECOMWORLDWIRE-7 April 2006-TJ Group Plc sel...
freq,2879,2


In [6]:
# Class distribution of the raw data, most frequent sentiment first.
df['sentiment'].value_counts()

sentiment
neutral     2879
positive    1363
negative     604
Name: count, dtype: int64

In [7]:
# Encode the three sentiment classes as integer targets for scikit-learn.
sentiment_to_label = {'negative': 0, 'neutral': 1, 'positive': 2}
df['label'] = df['sentiment'].map(sentiment_to_label)

# Series.map yields NaN for any value that is missing from the mapping, which
# would silently turn the label column into floats. Fail fast instead.
unmapped_sentiments = df.loc[df['label'].isna(), 'sentiment'].unique()
assert len(unmapped_sentiments) == 0, f"Unmapped sentiment values: {list(unmapped_sentiments)}"

In [8]:
# Preview the frame again to confirm the new integer `label` column.
df.head()

Unnamed: 0,sentiment,text,label
0,neutral,"According to Gran , the company has no plans t...",1
1,neutral,Technopolis plans to develop in stages an area...,1
2,negative,The international electronic industry company ...,0
3,positive,With the new production plant the company woul...,2
4,positive,According to the company 's updated strategy f...,2


In [9]:
# Undersample every class down to the size of the rarest class. The size is
# derived from the data rather than hard-coded, so the cell keeps working if
# the dataset changes. int() turns the NumPy integer into a plain Python int.
min_samples = int(df['sentiment'].value_counts().min())

# A fixed random_state keeps each draw reproducible across runs.
df_negative = df[df.sentiment == "negative"].sample(min_samples, random_state=42)
df_neutral = df[df.sentiment == "neutral"].sample(min_samples, random_state=42)
df_positive = df[df.sentiment == "positive"].sample(min_samples, random_state=42)

In [10]:
# Stack the three per-class samples row-wise into a single frame
# (pd.concat concatenates along axis 0 by default).
df_balanced = pd.concat([
    df_negative,
    df_neutral,
    df_positive,
])

# Show how many rows each sentiment contributes to the combined frame.
df_balanced['sentiment'].value_counts()

sentiment
negative    604
neutral     604
positive    604
Name: count, dtype: int64

In [11]:
# Preview the balanced frame; the original row index is kept after sampling.
df_balanced.head()

Unnamed: 0,sentiment,text,label
3790,negative,The company decided at the end of 2008 to temp...,0
4670,negative,down to EUR5 .9 m H1 '09 3 August 2009 - Finni...,0
4797,negative,The steelmaker said that the drop in profit wa...,0
2743,negative,Finland-based Stockmann Group has closed seven...,0
4065,negative,Operating loss before non-recurring items was ...,0


In [12]:
# Distinct integer labels present in the full (unbalanced) frame.
df['label'].unique()

array([1, 0, 2])

In [13]:
# Per-label row counts in the balanced frame.
df_balanced['label'].value_counts()

label
0    604
1    604
2    604
Name: count, dtype: int64

In [14]:
import spacy

# Load the spaCy "en_core_web_sm" pipeline once; preprocess() below reads this module-level `nlp`.
nlp = spacy.load("en_core_web_sm")

def preprocess(text):
    """Return ``text`` reduced to its space-joined lemmas.

    The text is run through the module-level spaCy pipeline ``nlp``. Tokens
    flagged as stop words or punctuation are dropped; the lemma of every
    remaining token is kept in document order.
    """
    kept_lemmas = []
    for tok in nlp(text):
        # Skip tokens that carry no content for the bag-of-words models.
        if tok.is_stop or tok.is_punct:
            continue
        kept_lemmas.append(tok.lemma_)
    return " ".join(kept_lemmas)


In [15]:
# Run every raw text through preprocess() and store the cleaned string alongside it.
df_balanced['preprocessed_txt'] = df_balanced['text'].map(preprocess)

In [16]:
# Compare the raw `text` with the new `preprocessed_txt` column on a few rows.
df_balanced.head()

Unnamed: 0,sentiment,text,label,preprocessed_txt
3790,negative,The company decided at the end of 2008 to temp...,0,company decide end 2008 temporarily shut ammon...
4670,negative,down to EUR5 .9 m H1 '09 3 August 2009 - Finni...,0,EUR5 .9 m H1 09 3 August 2009 finnish media gr...
4797,negative,The steelmaker said that the drop in profit wa...,0,steelmaker say drop profit explain continue ec...
2743,negative,Finland-based Stockmann Group has closed seven...,0,Finland base Stockmann Group close seven franc...
4065,negative,Operating loss before non-recurring items was ...,0,operate loss non recurring item EUR 0.9 mn com...


In [17]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the balanced data for testing. Stratifying on the label
# keeps the class proportions the same in both splits, and the fixed seed
# makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df_balanced['preprocessed_txt'],
    df_balanced['label'],
    stratify=df_balanced['label'],
    test_size=0.2,
    random_state=42,
)

In [18]:
# Number of documents in the training split.
print(X_train.shape)
# Preview a few preprocessed training texts.
X_train.head()

(1449,)


2663    party agree disclose price deal group say pres...
4827    external net sale printing business fall 43.7 ...
2545    POYRY PLC additional information Heikki Maline...
4529             ship delay estimate complete spring 2010
1834                    active shipping essential Finland
Name: preprocessed_txt, dtype: object

In [19]:
# Check the class balance of the training labels after the stratified split.
y_train.value_counts()

label
1    483
0    483
2    483
Name: count, dtype: int64

In [20]:
# Check the class balance of the test labels after the stratified split.
y_test.value_counts()

label
1    121
0    121
2    121
Name: count, dtype: int64

In [21]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

**Using unigram + bigram counts (`ngram_range=(1, 2)`) with a MultinomialNB classifier**

In [22]:
# Unigram + bigram counts feeding a multinomial Naive Bayes classifier.
clf = Pipeline(steps=[
    ('vectorizer_bow', CountVectorizer(ngram_range=(1, 2))),
    ('Multi NB', MultinomialNB()),
])

# Fit on the training split (fit returns the pipeline itself), then predict
# the held-out test texts.
y_pred = clf.fit(X_train, y_train).predict(X_test)

# Per-class precision / recall / F1 on the test split.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.66      0.80      0.72       121
           1       0.69      0.60      0.64       121
           2       0.62      0.57      0.59       121

    accuracy                           0.66       363
   macro avg       0.66      0.66      0.65       363
weighted avg       0.66      0.66      0.65       363



**Using bag-of-words (unigram counts) with a MultinomialNB classifier**

In [23]:
# Plain bag-of-words counts (default CountVectorizer settings) feeding a
# multinomial Naive Bayes classifier.
clf = Pipeline(steps=[
    ('vectorizer_bow', CountVectorizer()),
    ('Multi NB', MultinomialNB()),
])

# Fit on the training split (fit returns the pipeline itself), then predict
# the held-out test texts.
y_pred = clf.fit(X_train, y_train).predict(X_test)

# Per-class precision / recall / F1 on the test split.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.64      0.82      0.72       121
           1       0.68      0.60      0.64       121
           2       0.62      0.52      0.57       121

    accuracy                           0.65       363
   macro avg       0.65      0.65      0.64       363
weighted avg       0.65      0.65      0.64       363



In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF weighted features feeding a multinomial Naive Bayes classifier.
# The vectorizer uses its default settings here: no ngram_range is passed,
# unlike the n-gram CountVectorizer experiment earlier in the notebook.
clf = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('Multi NB', MultinomialNB())
])

# Fit on the training split.
clf.fit(X_train, y_train)

# Predict the held-out test texts.
y_pred = clf.predict(X_test)

# Per-class precision / recall / F1 on the test split.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.64      0.82      0.72       121
           1       0.67      0.56      0.61       121
           2       0.63      0.55      0.59       121

    accuracy                           0.64       363
   macro avg       0.65      0.64      0.64       363
weighted avg       0.65      0.64      0.64       363



In [25]:
from sklearn.neighbors import KNeighborsClassifier

# TF-IDF features feeding a k-nearest-neighbours classifier (default settings).
clf = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('knn', KNeighborsClassifier()),
])

# Fit on the training split (fit returns the pipeline itself), then predict
# the held-out test texts.
y_pred = clf.fit(X_train, y_train).predict(X_test)

# Per-class precision / recall / F1 on the test split.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.60      0.71      0.65       121
           1       0.51      0.73      0.60       121
           2       0.59      0.22      0.32       121

    accuracy                           0.55       363
   macro avg       0.56      0.55      0.52       363
weighted avg       0.56      0.55      0.52       363



In [36]:
# Requires the lazypredict package; install it once with: %pip install lazypredict

In [27]:
# Imports for this cell (several repeat imports made earlier in the notebook).
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from lazypredict.Supervised import LazyClassifier


# Standalone TF-IDF vectorizer (default settings) used to build the feature
# matrices that are handed to LazyClassifier.
vectorizer = TfidfVectorizer()

# Learn the vocabulary / IDF weights on the training split only, then apply
# the same fitted transform to the test split.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Densify the sparse matrices into DataFrames with one column per vocabulary term.
X_train_tfidf_df = pd.DataFrame(X_train_tfidf.toarray(), columns=vectorizer.get_feature_names_out())
X_test_tfidf_df = pd.DataFrame(X_test_tfidf.toarray(), columns=vectorizer.get_feature_names_out())

# LazyClassifier with a fixed seed.
lazy_clf = LazyClassifier(random_state=42)

# Fit on the training features and score on the test features; returns a
# per-model metrics table and the predictions object.
models, predictions = lazy_clf.fit(X_train_tfidf_df, X_test_tfidf_df, y_train, y_test)

# Print the per-model performance table.
print(models)

# TF-IDF + KNeighborsClassifier pipeline, fitted on the raw preprocessed text.
# NOTE(review): the model is hard-coded here, not picked from the `models`
# table above — confirm that KNN is the intended choice.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('knn', KNeighborsClassifier())
])

# Fit the pipeline on the training split.
pipeline.fit(X_train, y_train)

# Predict the held-out test texts.
y_pred = pipeline.predict(X_test)

# Per-class precision / recall / F1 on the test split.
print(classification_report(y_test, y_pred))


 97%|█████████▋| 28/29 [03:32<00:05,  5.28s/it]

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001845 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2628
[LightGBM] [Info] Number of data points in the train set: 1449, number of used features: 134
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


100%|██████████| 29/29 [03:34<00:00,  7.39s/it]


                               Accuracy  Balanced Accuracy ROC AUC  F1 Score  \
Model                                                                          
BernoulliNB                        0.65               0.65    None      0.64   
XGBClassifier                      0.64               0.64    None      0.64   
BaggingClassifier                  0.63               0.63    None      0.63   
ExtraTreesClassifier               0.62               0.62    None      0.62   
NearestCentroid                    0.61               0.61    None      0.61   
RandomForestClassifier             0.61               0.61    None      0.61   
NuSVC                              0.60               0.60    None      0.60   
PassiveAggressiveClassifier        0.59               0.59    None      0.59   
LogisticRegression                 0.59               0.59    None      0.59   
AdaBoostClassifier                 0.58               0.58    None      0.58   
CalibratedClassifierCV             0.58 

In [35]:
from sklearn.naive_bayes import BernoulliNB

# TF-IDF features feeding a Bernoulli Naive Bayes classifier.
clf = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('bnb', BernoulliNB()),
])

# Fit on the training split (fit returns the pipeline itself), then predict
# the held-out test texts.
y_pred = clf.fit(X_train, y_train).predict(X_test)

# Per-class precision / recall / F1 on the test split.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.64      0.83      0.72       121
           1       0.64      0.65      0.65       121
           2       0.68      0.46      0.55       121

    accuracy                           0.65       363
   macro avg       0.65      0.65      0.64       363
weighted avg       0.65      0.65      0.64       363

