In [1]:
from google.colab import drive
# Attach Google Drive at /content/drive; the CSV reads in this notebook use
# paths underneath that mount point.
drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [3]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd

# Load the raw CSV from its fully qualified Drive path.
df = pd.read_csv('/content/drive/MyDrive/bert-sentiment-analysis/data/raw/financial-data-sentiment-analysis.csv')

# Encode the string values of 'Sentiment' as integer ids in a new 'label' column.
le = LabelEncoder()
df['label'] = le.fit_transform(df['Sentiment'])

# Build a class-name -> id lookup. The ids are cast to built-in int so the
# mapping prints as plain numbers rather than np.int64(...) reprs and can be
# serialised (e.g. to JSON) without a custom encoder.
label_mapping = {
    class_name: int(class_id)
    for class_name, class_id in zip(le.classes_, le.transform(le.classes_))
}
print("Label mapping:", label_mapping)

# Preview the original sentiment next to its encoded label.
print(df[['Sentiment', 'label']].head())

Label mapping: {'negative': np.int64(0), 'neutral': np.int64(1), 'positive': np.int64(2)}
  Sentiment  label
0  positive      2
1  negative      0
2  positive      2
3   neutral      1
4   neutral      1


In [4]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import pandas as pd

# Read the raw dataset from its fully qualified Drive path.
df = pd.read_csv('/content/drive/MyDrive/bert-sentiment-analysis/data/raw/financial-data-sentiment-analysis.csv')

# Recreate the integer 'label' column so this cell does not rely on an earlier one.
le = LabelEncoder()
df['label'] = le.fit_transform(df['Sentiment'])

# Step 1: hold out 15% of all rows as the test set, stratified on the label
# so every split keeps the same class proportions.
train_val_df, test_df = train_test_split(
    df, test_size=0.15, stratify=df['label'], random_state=42
)

# Step 2: carve 15% of the remaining rows off as validation
# (0.85 * 0.15 = about 12.75% of the full dataset).
train_df, val_df = train_test_split(
    train_val_df, test_size=0.15, stratify=train_val_df['label'], random_state=42
)

# Report how many rows ended up in each split.
print(f"Train size: {train_df.shape[0]}")
print(f"Validation size: {val_df.shape[0]}")
print(f"Test size: {test_df.shape[0]}")

# Report the class proportions per split to confirm stratification worked.
for heading, split_df in (
    ("Train label distribution:", train_df),
    ("Validation label distribution:", val_df),
    ("Test label distribution:", test_df),
):
    print(heading)
    print(split_df['label'].value_counts(normalize=True))

Train size: 4220
Validation size: 745
Test size: 877
Train label distribution:
label
1    0.535782
2    0.317062
0    0.147156
Name: proportion, dtype: float64
Validation label distribution:
label
1    0.535570
2    0.316779
0    0.147651
Name: proportion, dtype: float64
Test label distribution:
label
1    0.535918
2    0.316990
0    0.147092
Name: proportion, dtype: float64


In [None]:
import pandas as pd
import re

# Read the raw dataset from its fully qualified Drive path.
df = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/bert-sentiment-analysis/data/raw/financial-data-sentiment-analysis.csv'
)

def clean_text(text):
    """Strip URLs, ticker symbols and non-letter characters from a sentence.

    The steps run in this order:
      1. remove every token that starts with ``http`` or ``www``;
      2. remove ``$``-prefixed tokens such as ``$ESI`` (this also takes the
         ``$<digits>`` head of a price like ``$1.50``);
      3. delete every character that is not an ASCII letter or whitespace;
      4. collapse each run of whitespace to a single space and trim the ends.

    Args:
        text: The raw sentence. Anything that is not a ``str`` (for example
            ``None`` or the float NaN pandas uses for a missing cell) is
            treated as empty text.

    Returns:
        str: The cleaned sentence; may be the empty string.
    """
    # Missing values would make re.sub raise TypeError; map them to '' instead.
    if not isinstance(text, str):
        return ''

    # Remove URLs ('http\S+' already covers 'https...' links).
    text = re.sub(r'http\S+|www\S+', '', text)

    # Remove ticker symbols (e.g., $ESI)
    text = re.sub(r'\$\w*', '', text)

    # Remove special characters and numbers (optional, depending on use case)
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Remove extra whitespace: the deletions above leave gaps such as
    # "Visit  now", so collapse interior runs as well as trimming the ends.
    text = re.sub(r'\s+', ' ', text).strip()

    return text

# Run every raw sentence through clean_text and keep the result in a new
# 'cleaned_sentence' column next to the original text.
raw_sentences = df['Sentence']
df['cleaned_sentence'] = raw_sentences.apply(clean_text)


In [None]:
import pandas as pd
# Read the raw dataset from its fully qualified Drive path.
df = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/bert-sentiment-analysis/data/raw/financial-data-sentiment-analysis.csv'
)

In [None]:
# Show basic info (columns, non-null counts, dtypes, memory usage).
# DataFrame.info() writes the report to stdout itself and returns None, so it
# is called directly; wrapping it in print() only appended a stray "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5842 entries, 0 to 5841
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Sentence   5842 non-null   object
 1   Sentiment  5842 non-null   object
dtypes: object(2)
memory usage: 91.4+ KB
None


In [None]:
# Count how many sentences carry each sentiment class.
sentiment_counts = df['Sentiment'].value_counts()
print(sentiment_counts)

Sentiment
neutral     3130
positive    1852
negative     860
Name: count, dtype: int64


In [None]:
# Preview the first five rows (five is also head()'s default).
print(df.head(n=5))

                                            Sentence Sentiment
0  The GeoSolutions technology will leverage Bene...  positive
1  $ESI on lows, down $1.50 to $2.50 BK a real po...  negative
2  For the last quarter of 2010 , Componenta 's n...  positive
3  According to the Finnish-Russian Chamber of Co...   neutral
4  The Swedish buyout firm has sold its remaining...   neutral


In [None]:
from google.colab import drive
# Attach Google Drive at /content/drive. If the drive is already attached the
# call only reports that and leaves the existing mount in place.
drive.mount(mountpoint='/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!mkdir -p bert-sentiment-analysis/data/raw
!mkdir -p bert-sentiment-analysis/data/processed
!mkdir -p bert-sentiment-analysis/notebooks
!mkdir -p bert-sentiment-analysis/src

In [None]:
!touch bert-sentiment-analysis/src/data_preprocessing.py
!touch bert-sentiment-analysis/src/model.py
!touch bert-sentiment-analysis/src/train.py
!touch bert-sentiment-analysis/src/evaluate.py