In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
df = pd.read_csv('/content/drive/MyDrive/bert-sentiment-analysis/data/raw/financial-data-sentiment-analysis.csv')

In [None]:
# Show first few rows
print(df.head())

                                            Sentence Sentiment
0  The GeoSolutions technology will leverage Bene...  positive
1  $ESI on lows, down $1.50 to $2.50 BK a real po...  negative
2  For the last quarter of 2010 , Componenta 's n...  positive
3  According to the Finnish-Russian Chamber of Co...   neutral
4  The Swedish buyout firm has sold its remaining...   neutral


In [None]:
# Show basic info
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5842 entries, 0 to 5841
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Sentence   5842 non-null   object
 1   Sentiment  5842 non-null   object
dtypes: object(2)
memory usage: 91.4+ KB
None


In [None]:
# Check label distribution
print(df['Sentiment'].value_counts())

Sentiment
neutral     3130
positive    1852
negative     860
Name: count, dtype: int64


In [None]:
import pandas as pd
import re

# Load CSV - Full Qualified Path
df = pd.read_csv('/content/drive/MyDrive/bert-sentiment-analysis/data/raw/financial-data-sentiment-analysis.csv')

def clean_text(text):

    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

    # Remove ticker symbols (e.g., $ESI)
    text = re.sub(r'\$\w*', '', text)

    # Remove special characters and numbers (optional, depending on use case)
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Remove extra whitespace
    text = text.strip()

    return text

# Apply to your dataframe
df['cleaned_sentence'] = df['Sentence'].apply(clean_text)


In [None]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd

# Load CSV - Full Qualified Path
df = pd.read_csv('/content/drive/MyDrive/bert-sentiment-analysis/data/raw/financial-data-sentiment-analysis.csv')

# Initialize Label Encoder
le = LabelEncoder()

# Fit and transform the 'Sentiment' column to numeric labels
df['label'] = le.fit_transform(df['Sentiment'])

# Check the mapping
label_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print("Label mapping:", label_mapping)

# Check a few rows
print(df[['Sentiment', 'label']].head())

Label mapping: {'negative': np.int64(0), 'neutral': np.int64(1), 'positive': np.int64(2)}
  Sentiment  label
0  positive      2
1  negative      0
2  positive      2
3   neutral      1
4   neutral      1


In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import pandas as pd

# Load CSV - Full Qualified Path
df = pd.read_csv('/content/drive/MyDrive/bert-sentiment-analysis/data/raw/financial-data-sentiment-analysis.csv')

# Ensure df has a 'label' column after encoding
le = LabelEncoder()
df['label'] = le.fit_transform(df['Sentiment'])

# Step 1: Split off test set (e.g., 15% of data)
train_val_df, test_df = train_test_split(
    df,
    test_size=0.15,
    stratify=df['label'],  # maintain class proportions
    random_state=42
)

# Step 2: Split train and validation sets (e.g., 85% train_val into 85% train, 15% validation)
train_df, val_df = train_test_split(
    train_val_df,
    test_size=0.15,  # 15% of train_val is validation → about 12.75% of total data
    stratify=train_val_df['label'],
    random_state=42
)

# Check the shape of splits
print(f"Train size: {train_df.shape[0]}")
print(f"Validation size: {val_df.shape[0]}")
print(f"Test size: {test_df.shape[0]}")

# Check label distribution in each set
print("Train label distribution:")
print(train_df['label'].value_counts(normalize=True))
print("Validation label distribution:")
print(val_df['label'].value_counts(normalize=True))
print("Test label distribution:")
print(test_df['label'].value_counts(normalize=True))

Train size: 4220
Validation size: 745
Test size: 877
Train label distribution:
label
1    0.535782
2    0.317062
0    0.147156
Name: proportion, dtype: float64
Validation label distribution:
label
1    0.535570
2    0.316779
0    0.147651
Name: proportion, dtype: float64
Test label distribution:
label
1    0.535918
2    0.316990
0    0.147092
Name: proportion, dtype: float64


In [None]:
#Install Transformers
!pip install transformers



In [None]:
# Confirm the version of transformer
import transformers
print(transformers.__version__)
print(transformers.__file__)

4.53.1
/usr/local/lib/python3.11/dist-packages/transformers/__init__.py


In [None]:
#Import and load the BERT tokenizer

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
train_df['cleaned_sentence'] = train_df['Sentence'].apply(clean_text)
val_df['cleaned_sentence'] = val_df['Sentence'].apply(clean_text)
test_df['cleaned_sentence'] = test_df['Sentence'].apply(clean_text)

In [None]:
print(train_df.head())

                                               Sentence Sentiment  label  \
5294  Many of the commercial vessels had got stuck i...  negative      0   
5735  potential defect with third-row seat belts. Te...  negative      0   
2443  Excluding non-recurring items , pre-tax profit...  positive      2   
1092  Profit before taxes was EUR 4.0 mn , down from...   neutral      1   
1978  $SKH http://stks.co/163e Long setup. Watch for...  positive      2   

                                       cleaned_sentence  
5294  Many of the commercial vessels had got stuck i...  
5735  potential defect with thirdrow seat belts Tesl...  
2443  Excluding nonrecurring items  pretax profit su...  
1092  Profit before taxes was EUR  mn  down from EUR...  
1978  Long setup Watch for continuation and volume e...  


In [None]:
print('train_df' in globals())
print('val_df' in globals())
print('test_df' in globals())

True
True
True


In [None]:
print(train_df.head())
print(val_df.head())
print(test_df.head())

                                               Sentence Sentiment  label  \
5294  Many of the commercial vessels had got stuck i...  negative      0   
5735  potential defect with third-row seat belts. Te...  negative      0   
2443  Excluding non-recurring items , pre-tax profit...  positive      2   
1092  Profit before taxes was EUR 4.0 mn , down from...   neutral      1   
1978  $SKH http://stks.co/163e Long setup. Watch for...  positive      2   

                                       cleaned_sentence  
5294  Many of the commercial vessels had got stuck i...  
5735  potential defect with thirdrow seat belts Tesl...  
2443  Excluding nonrecurring items  pretax profit su...  
1092  Profit before taxes was EUR  mn  down from EUR...  
1978  Long setup Watch for continuation and volume e...  
                                               Sentence Sentiment  label  \
3520                                         Long $PCLN  positive      2   
5147  $HLF shorts made a killing last coupl

In [None]:
'''
Tokenize  Texts
Tokenize 'cleaned_sentence' column in Train, Validation, and Test dataframes.
'''
def tokenize_texts(texts, max_length=128):
    return tokenizer(
        list(texts),                  # list of texts
        padding='max_length',         # pad all to max_length
        truncation=True,              # truncate longer texts
        max_length=max_length,        # max token length
        return_tensors='pt'           # PyTorch tensors
    )

train_encodings = tokenize_texts(train_df['cleaned_sentence'])
val_encodings = tokenize_texts(val_df['cleaned_sentence'])
test_encodings = tokenize_texts(test_df['cleaned_sentence'])

In [None]:
import torch

train_labels = torch.tensor(train_df['label'].values)
val_labels = torch.tensor(val_df['label'].values)
test_labels = torch.tensor(test_df['label'].values)

In [None]:
from torch.utils.data import Dataset

class SentimentDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item
    def __len__(self):
        return len(self.labels)

train_dataset = SentimentDataset(train_encodings, train_labels)
val_dataset = SentimentDataset(val_encodings, val_labels)
test_dataset = SentimentDataset(test_encodings, test_labels)


In [None]:
train_df.to_csv('/content/drive/MyDrive/bert-sentiment-analysis/data/processed/train_processed.csv', index=False)
val_df.to_csv('/content/drive/MyDrive/bert-sentiment-analysis/data/processed/val_processed.csv', index=False)
test_df.to_csv('/content/drive/MyDrive/bert-sentiment-analysis/data/processed/test_processed.csv', index=False)

In [None]:
from sklearn.model_selection import train_test_split
import os

# Stratified split into train/val/test
train_val_df, test_df = train_test_split(
    df,
    test_size=0.15,
    stratify=df['label'],
    random_state=42
)
train_df, val_df = train_test_split(
    train_val_df,
    test_size=0.15,
    stratify=train_val_df['label'],
    random_state=42
)

# Save splits to processed folder
processed_dir = '/content/drive/MyDrive/bert-financial-sentiment-classifier/data/processed/'
os.makedirs(processed_dir, exist_ok=True)

train_df.to_csv(os.path.join(processed_dir, 'train_processed.csv'), index=False)
val_df.to_csv(os.path.join(processed_dir, 'val_processed.csv'), index=False)
test_df.to_csv(os.path.join(processed_dir, 'test_processed.csv'), index=False)

print("Train, validation, and test sets saved to processed folder.")

Train, validation, and test sets saved to processed folder.
