#### **Imports**

In [1]:
import warnings

import numpy as np 
import pandas as pd

from transformers import pipeline

warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


#### **Load Data and Pipeline**

In [2]:
df = pd.read_csv("../data/interim/News1.csv")

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4555 entries, 0 to 4554
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   publish_date       4555 non-null   object
 1   headline_category  4555 non-null   object
 2   headline_text      4555 non-null   object
 3   main_category      4555 non-null   object
 4   sub_category       3363 non-null   object
dtypes: object(5)
memory usage: 178.1+ KB


- We are going to use FinBert model in order to find sentiments for the news headlines
- We will collect all three scores - positive, negative and neutral along with labels

In [4]:
# for pytorch
# model = pipeline(task = "text-classification",  model="ProsusAI/finbert", return_all_scores = True)

# for tensorflow
model = pipeline(task = "text-classification",  model="yiyanghkust/finbert-tone", tokenizer="yiyanghkust/finbert-tone", framework="tf", return_all_scores = True)





All model checkpoint layers were used when initializing TFBertForSequenceClassification.

All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at yiyanghkust/finbert-tone.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.
Device set to use 0


In [5]:
# df2 = df.sample(frac=1, random_state=42).reset_index(drop=True) # if you want to shuffle
df2 = df

In [6]:
# for storing the predicted values
sentiment = [np.nan for i in range(len(df2))]
pos_score = [np.nan for i in range(len(df2))]
neg_score = [np.nan for i in range(len(df2))]
neu_score = [np.nan for i in range(len(df2))]

In [7]:
def get_sentiment_features(text):
    """
        Get sentiment scores and labels using FinBERT
    """
    result = model.predict(text)[0]
    
    # store the values in dict as label : score
    scores = {res['label']: res['score'] for res in result}

    # Get highest score label as output label for this text
    label = max(scores, key=scores.get)

    # return positive, negative, neutral and label    
    return scores["Positive"], scores["Negative"], scores["Neutral"], label

In [8]:
# for predicting labels only for some of the records


# batch_size = 6

# for i in range(0, len(df2), batch_size):
#     sample = df2.iloc[i]
#     prediction = get_sentiment_features(sample.headline_text)
#     sentiment[i] = prediction[3]
#     pos_score[i] = prediction[0]
#     neg_score[i] = prediction[1]
#     neu_score[i] = prediction[2]

In [9]:
# for each record find prediction

for i in range(0, len(df2)):
    sample = df2.iloc[i]
    prediction = get_sentiment_features(sample.headline_text)
    sentiment[i] = prediction[3]
    pos_score[i] = prediction[0]
    neg_score[i] = prediction[1]
    neu_score[i] = prediction[2]

KeyboardInterrupt: 

In [None]:
# store the predicted values in df

df2['sentiment'] = sentiment
df2['pos_score'] = pos_score
df2['neg_score'] = neg_score
df2['neu_score'] = neu_score

In [None]:
# encoding the categories

label_mapping = {"positive": 1, "neutral": 0, "negative": -1}
df2["sentiment_label_num"] = df2["sentiment"].map(label_mapping)

In [None]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4555 entries, 0 to 4554
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   publish_date         4555 non-null   object 
 1   headline_category    4555 non-null   object 
 2   headline_text        4555 non-null   object 
 3   main_category        4555 non-null   object 
 4   sub_category         3363 non-null   object 
 5   sentiment            4555 non-null   object 
 6   pos_score            4555 non-null   float64
 7   neg_score            4555 non-null   float64
 8   neu_score            4555 non-null   float64
 9   sentiment_label_num  0 non-null      float64
dtypes: float64(4), object(6)
memory usage: 356.0+ KB


##### Save the file

In [None]:
df2.to_csv("../data/interim/News2.csv", index = False)

### Below code was return for semi-supervised learning approach

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
sentiment_encoded = encoder.fit_transform(df2[["sentiment"]].fillna("unknown"))

In [None]:
sentiment_encoded

In [None]:
sentiment_encoded_df = pd.DataFrame(sentiment_encoded, columns=encoder.get_feature_names_out(["sentiment"]))
df3 = pd.concat([df2, sentiment_encoded_df], axis=1)

In [None]:
df3.head()

In [None]:
from transformers import TFAutoModel, AutoTokenizer

# Load the FinBERT model and tokenizer using TensorFlow
tokenizer = AutoTokenizer.from_pretrained("yiyanghkust/finbert-tone")
model = TFAutoModel.from_pretrained("yiyanghkust/finbert-tone")

In [None]:
def get_embedding(text):
    """Convert text into a numerical vector using FinBERT in TensorFlow"""
    # Tokenize the input text
    inputs = tokenizer(text, return_tensors="tf", truncation=True, padding=True)
    
    # Run the model and get the hidden state embeddings
    outputs = model(**inputs)
    
    # The embeddings are in the last_hidden_state attribute (shape: [batch_size, seq_length, hidden_size])
    # We take the mean of all token embeddings (mean pooling)
    last_hidden_state = outputs.last_hidden_state
    embedding = tf.reduce_mean(last_hidden_state, axis=1)  # Mean pooling over tokens
    
    # Convert tensor to numpy array and remove extra dimensions
    return embedding.numpy().squeeze()

In [None]:
df2["embedding"] = df2["headline_text"].apply(lambda x: get_embedding(str(x)))

In [None]:
embedding = np.vstack(df2.embedding.values)

In [None]:
df2.fillna({"pos_score": 0, "neg_score": 0, "neu_score": 0, "sentiment_label_num": 0}, inplace=True)

In [None]:
sentiment_features = df2[["pos_score", "neg_score", "neu_score", "sentiment_label_num"]].values

In [None]:
X = np.hstack((embedding, sentiment_features))

In [None]:
# X = embedding

In [None]:
from sklearn.cluster import KMeans, DBSCAN

In [None]:
num_clusters = 3  # Adjust based on dataset
# kmeans = KMeans(n_clusters=num_clusters, random_state=42, n_init=10)
dbscan = DBSCAN(min_samples=5)
# clusters = kmeans.fit_predict(X)
clusters = dbscan.fit_predict(X)

In [None]:
df2["cluster"] = clusters

In [None]:
df2.groupby("cluster")["sentiment"].agg(lambda x: x.mode()[0] if not x.isna().all() else "unknown")

In [None]:
df2.cluster.value_counts()

In [None]:
df2

In [None]:
cluster_sentiment_map = df2.groupby("cluster")["sentiment"].agg(lambda x: x.mode()[0] if not x.isna().all() else "unknown")

In [None]:
# Assign labels to previously unlabeled data
df2["final_sentiment_label"] = df2["cluster"].map(cluster_sentiment_map)

In [None]:
df2