In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import re
import string
import nltk
import inflect
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Models
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import xgboost as xgb
import lightgbm as lgb

### Data

In [3]:
df = pd.read_csv("/Users/pin.lyu/Documents/BC_Folder/NLP/Data/financial_data.csv")

df

Unnamed: 0,Sentence,Sentiment
0,The GeoSolutions technology will leverage Bene...,positive
1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",negative
2,"For the last quarter of 2010 , Componenta 's n...",positive
3,According to the Finnish-Russian Chamber of Co...,neutral
4,The Swedish buyout firm has sold its remaining...,neutral
...,...,...
5837,RISING costs have forced packaging producer Hu...,negative
5838,Nordic Walking was first used as a summer trai...,neutral
5839,"According shipping company Viking Line , the E...",neutral
5840,"In the building and home improvement trade , s...",neutral


### EDA

In [5]:
# Check the types of dependent variable 

sentiment_types = df["Sentiment"].unique()

# Count occurrences of each sentiment type

sentiment_counts = df["Sentiment"].value_counts()

# Results
print("Sentiment Types:\n", sentiment_types)

print("\nSentiment Counts:\n", sentiment_counts)

Sentiment Types:
 ['positive' 'negative' 'neutral']

Sentiment Counts:
 Sentiment
neutral     3130
positive    1852
negative     860
Name: count, dtype: int64


_Comments_: unbalanced data, treatment needed

### Data Processing

In [8]:
# Initialize number-to-word converter

p = inflect.engine()

# Function to replace numbers with words

def replace_numbers(text):
    
    return re.sub(r'\d+', lambda x: p.number_to_words(x.group()), text)

In [9]:
def preprocess(text):
    
    text = text.lower()                                 # Lower case all words

    text = replace_numbers(text)                        # Convert numbers to words
    
    text = re.sub(f"[{string.punctuation}]", "", text)  # Remove punctuation
    
    tokens = word_tokenize(text)                        # Tokenize words
    
    stop_words = set(stopwords.words("english"))        # Activate stop words identifier

    lemmatizer = WordNetLemmatizer()                    # Initialize lemmatizer
    
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word.isalpha() and word not in stop_words]  # Lemmatize filtered words
    
    return tokens

In [10]:
# Preprocessing the "Sentence" column

df["Processed_Sentence"] = df["Sentence"].apply(preprocess)

# Result

print(df[["Sentence", "Processed_Sentence"]].head())

                                            Sentence  \
0  The GeoSolutions technology will leverage Bene...   
1  $ESI on lows, down $1.50 to $2.50 BK a real po...   
2  For the last quarter of 2010 , Componenta 's n...   
3  According to the Finnish-Russian Chamber of Co...   
4  The Swedish buyout firm has sold its remaining...   

                                  Processed_Sentence  
0  [geosolutions, technology, leverage, benefon, ...  
1  [esi, low, onefifty, twofifty, bk, real, possi...  
2  [last, quarter, two, thousand, ten, componenta...  
3  [according, finnishrussian, chamber, commerce,...  
4  [swedish, buyout, firm, sold, remaining, twent...  


In [11]:
# Encode sentiment labels

sentiment_num = {"positive": 1, 
                     
                     "neutral": 0, 
                     
                     "negative": 2}

# Map numeric labels onto actual data

df["Sentiment_NumLabel"] = df["Sentiment"].map(sentiment_num)

### Models

In [13]:
# Turn list into string

df["Processed_Sentence"] = df["Processed_Sentence"].apply(lambda x: ' '.join(x) if isinstance(x, list) else x)

# Split data with 80/20 rule

X_train, X_test, y_train, y_test = train_test_split(df["Processed_Sentence"], df["Sentiment_NumLabel"], test_size=0.2, random_state=226)

In [14]:
# Dimension Check

X_train.shape

X_test.shape

y_train.shape

y_test.shape

print("X_train shape:", X_train.shape)
     
print("X_test shape:", X_test.shape)
     
print("y_train shape:", y_train.shape)
      
print("y_test shape:", y_test.shape)

X_train shape: (4673,)
X_test shape: (1169,)
y_train shape: (4673,)
y_test shape: (1169,)


In [15]:
X_train = [str(sentence) for sentence in X_train]  

X_test = [str(sentence) for sentence in X_test]

# Convert text into numerical features using TF-IDF

vectorizer = TfidfVectorizer(max_features=3000) 

X_train = vectorizer.fit_transform(X_train)

X_test = vectorizer.transform(X_test)

In [16]:
# Initiate models

models = {
    
    "Naive Bayes": MultinomialNB(),
    
    "Logistic Regression": LogisticRegression(max_iter=1000),  
    
    "Random Forest": RandomForestClassifier(n_estimators=200, random_state=42),
    
    "XGBoost": xgb.XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss'),
    
    "LightGBM": lgb.LGBMClassifier(random_state=42),

    "SVM": SVC(kernel='linear', probability=True, random_state=42)
}

In [17]:
# Run & test models

results = {}

for name, model in models.items():
    
    print(f"Training {name}...")

    # Train the model
    
    model.fit(X_train, y_train) 

    # Predict on the test sbet
    
    y_pred = model.predict(X_test)  
    
    # Evaluate the model
    
    accuracy = accuracy_score(y_test, y_pred)
    
    report = classification_report(y_test, y_pred)
    
    confusion = confusion_matrix(y_test, y_pred)
    
    # Store results
    
    results[name] = {
        
        "accuracy": accuracy,
        
        "classification_report": report,
        
        "confusion_matrix": confusion
    }
    
    # Print results
    
    print(f"Results for {name}:")
    
    print(f"Accuracy: {accuracy:.4f}")
    
    print("Classification Report:")
    
    print(report)
    
    print("Confusion Matrix:")
    
    print(confusion)
    
    print("-" * 50)

Training Naive Bayes...
Results for Naive Bayes:
Accuracy: 0.6972
Classification Report:
              precision    recall  f1-score   support

           0       0.70      0.94      0.80       653
           1       0.72      0.54      0.61       351
           2       0.52      0.07      0.12       165

    accuracy                           0.70      1169
   macro avg       0.65      0.52      0.51      1169
weighted avg       0.68      0.70      0.65      1169

Confusion Matrix:
[[615  32   6]
 [158 189   4]
 [111  43  11]]
--------------------------------------------------
Training Logistic Regression...
Results for Logistic Regression:
Accuracy: 0.7177
Classification Report:
              precision    recall  f1-score   support

           0       0.72      0.90      0.80       653
           1       0.77      0.64      0.70       351
           2       0.43      0.17      0.24       165

    accuracy                           0.72      1169
   macro avg       0.64      0.57     

Parameters: { "use_label_encoder" } are not used.



Results for XGBoost:
Accuracy: 0.6595
Classification Report:
              precision    recall  f1-score   support

           0       0.67      0.86      0.75       653
           1       0.80      0.53      0.64       351
           2       0.26      0.16      0.20       165

    accuracy                           0.66      1169
   macro avg       0.58      0.51      0.53      1169
weighted avg       0.65      0.66      0.64      1169

Confusion Matrix:
[[560  25  68]
 [159 185   7]
 [119  20  26]]
--------------------------------------------------
Training LightGBM...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003650 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11023
[LightGBM] [Info] Number of data points in the train set: 4673, number of used features: 513
[LightGBM] [Info] Start training from score -0.634753
[LightGBM] [Info] Start training from score -1.135670
[LightGBM] [Info] Start 

_Comment_: Linear models seem to perform better here

#### Challenges

The project progressed smoothly overall, with no significant issues. However, a minor challenge arose when the text data was stored in a list format after being processed, which led to difficulties during the initialization of the TF-IDF vectors. After some troubleshooting, I identified the format issue and converted the text data from a list to a string format. Once resolved, I was able to proceed successfully to the modeling stage.

### Conclusions

The top-performing models in this analysis are the Logistic Regression and Support Vector Machine (SVM), both of which exhibit near-identical performance across all evaluation metrics—accuracy, precision, recall, and F1 score. 

Both models are particularly effective at identifying neutral instances, which is expected due to the high representation of neutral observations in the dataset. However, both models face challenges in accurately classifying positive and negative sentences, with a notable struggle in detecting negative sentiment. This performance is anticipated, given that these models are relatively simple and do not incorporate advanced techniques such as under- or over-sampling to address the class imbalance in the dataset. The performance of these models could likely be improved with the integration of such techniques.

Although the SVM model achieved slightly better performance than the multinomial logistic model, I would personally prefer the multinomial logistic model due to its faster computation. Based on the current results, it proves to be more efficient overall. 