In [None]:
!pip install transformers datasets

Collecting datasets
  Downloading datasets-3.5.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.1-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.4/491.4 kB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (

In [None]:
import pandas as pd
# NOTE(security): read_pickle unpickles the file, and unpickling can execute
# arbitrary code, so only load pickles that come from a trusted source.
df = pd.read_pickle('/content/text_label_finbert.pkl')
# Bare last expression so the notebook renders the frame (text and label columns).
df

Unnamed: 0,text,label
5123,amd forecasts 2 billion sales of ai chips help...,1
5124,qualcomm unveils snapdragon x intensifying com...,2
5125,amd q3 results eyed as aidriven future looms b...,1
5126,nvidia shares drop after report of canceled ch...,1
5127,dow jones futures rise in anticipation of fede...,2
...,...,...
64647,exclusive tesla supplier panasonic eyes 20 jum...,0
64648,dow futures tick higher ahead of key inflation...,1
64649,twitter sues elon musk to hold him to 44 billi...,1
64650,trump steps up attacks on musk who said trump ...,2


In [None]:
import pandas as pd
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
    DataCollatorWithPadding
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from datasets import Dataset, ClassLabel
import os

# --- Run configuration -------------------------------------------------
# Base checkpoint to fine-tune and the directory that receives all artifacts.
MODEL_NAME = "ProsusAI/finbert"
OUTPUT_DIR = "./stock_sentiment_results"

# Tokenization limit and optimisation hyper-parameters.
MAX_LENGTH = 512
BATCH_SIZE = 16
EPOCHS = 3
LEARNING_RATE = 2e-5

# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Make sure the output directory exists before anything is written to it.
os.makedirs(OUTPUT_DIR, exist_ok=True)

def load_data(file_path, test_size=0.2, random_state=42):
    """Load the labelled texts, split them, and tokenize both splits.

    Args:
        file_path: Path to a pickled pandas DataFrame that has a ``text``
            column and a ``label`` column.
        test_size: Fraction of rows held out for evaluation (default 0.2).
        random_state: Seed forwarded to ``train_test_split`` so the split is
            reproducible (default 42).

    Returns:
        A tuple ``(train_dataset, eval_dataset, labels, tokenizer)``. The two
        datasets are tokenized ``Dataset`` objects with the raw ``text``
        column removed, ``labels`` is a ``ClassLabel`` whose names are the
        sorted unique label values rendered as strings, and ``tokenizer`` is
        the tokenizer loaded for ``MODEL_NAME``.

    Raises:
        ValueError: If the frame lacks a ``text`` or ``label`` column.
    """
    # NOTE(security): unpickling can execute arbitrary code; only load
    # pickles that come from a trusted source.
    df = pd.read_pickle(file_path)

    # Fail early with a clear message instead of a KeyError deep inside the
    # split / tokenization code.
    missing = [column for column in ("text", "label") if column not in df.columns]
    if missing:
        raise ValueError(
            f"{file_path!r} is missing required column(s): {', '.join(missing)}"
        )

    train_df, eval_df = train_test_split(
        df, test_size=test_size, random_state=random_state
    )

    labels = ClassLabel(names=[str(value) for value in sorted(df["label"].unique())])

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    def preprocess_function(examples):
        # Truncate only; padding is left to the data collator at batch time.
        return tokenizer(
            examples["text"],
            truncation=True,
            max_length=MAX_LENGTH,
            padding=False
        )

    def _to_tokenized_dataset(frame):
        # preserve_index=False keeps the shuffled pandas index from leaking
        # into the dataset as an extra `__index_level_0__` column.
        dataset = Dataset.from_pandas(frame, preserve_index=False)
        return dataset.map(
            preprocess_function,
            batched=True,
            remove_columns=["text"]
        )

    train_dataset = _to_tokenized_dataset(train_df)
    eval_dataset = _to_tokenized_dataset(eval_df)

    return train_dataset, eval_dataset, labels, tokenizer

def compute_metrics(pred):
    """Score one evaluation pass.

    Args:
        pred: Object exposing ``predictions`` (per-class scores, classes on
            the last axis) and ``label_ids`` (the reference class ids).

    Returns:
        Dict with ``"accuracy"`` and the weighted-average ``"f1"``.
    """
    y_true = pred.label_ids
    # The predicted class is the index of the highest score on the last axis.
    y_pred = pred.predictions.argmax(-1)

    metrics = {}
    metrics["accuracy"] = accuracy_score(y_true, y_pred)
    metrics["f1"] = f1_score(y_true, y_pred, average="weighted")
    return metrics

def train(data_path="/content/text_label_finbert.pkl"):
    """Fine-tune ``MODEL_NAME`` on the labelled dataset and save the result.

    Loads and tokenizes the data with ``load_data``, trains with the Hugging
    Face ``Trainer`` (evaluating and checkpointing once per epoch, keeping the
    checkpoint with the best weighted F1), writes the model and tokenizer to
    ``OUTPUT_DIR/model``, then runs and prints a final evaluation.

    Args:
        data_path: Pickled DataFrame forwarded to ``load_data``. Defaults to
            the Colab path this notebook was written against.

    Returns:
        The metrics dict returned by the final ``trainer.evaluate()`` call.
    """
    train_dataset, eval_dataset, labels, tokenizer = load_data(data_path)

    # Dynamic padding: sequences were tokenized without padding, so each batch
    # is padded to its own longest member here.
    data_collator = DataCollatorWithPadding(
        tokenizer=tokenizer,
        padding="longest",
        max_length=MAX_LENGTH
    )

    # ignore_mismatched_sizes lets the checkpoint load even when the number of
    # labels in the data differs from the size of its classification head.
    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME,
        num_labels=len(labels.names),
        ignore_mismatched_sizes=True
    ).to(device)  # Move model to GPU if available

    training_args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        eval_strategy="epoch",
        save_strategy="epoch",
        learning_rate=LEARNING_RATE,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        num_train_epochs=EPOCHS,
        weight_decay=0.01,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        greater_is_better=True,
        logging_dir=os.path.join(OUTPUT_DIR, "logs"),
        logging_steps=10,
        report_to="none",
        save_total_limit=2,
        seed=42,
        lr_scheduler_type="linear",
        warmup_ratio=0.1,
        fp16=torch.cuda.is_available()  # Enable mixed precision training if GPU is available
    )

    # NOTE(review): evaluation runs once per epoch, so with EPOCHS = 3 a
    # patience of 3 appears unable to ever trigger; lower the patience or
    # raise EPOCHS if early stopping is actually wanted.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
        data_collator=data_collator,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
    )

    trainer.train()

    # Save model and tokenizer side by side so the directory can be passed
    # directly to from_pretrained() / pipeline().
    model_dir = os.path.join(OUTPUT_DIR, "model")
    model.save_pretrained(model_dir)
    tokenizer.save_pretrained(model_dir)

    eval_results = trainer.evaluate()
    print(f"Final evaluation results: {eval_results}")
    return eval_results

# Entry point: start training when this cell/script is executed directly,
# not when these definitions are imported from another module.
if __name__ == "__main__":
    train()

Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Map:   0%|          | 0/47623 [00:00<?, ? examples/s]

Map:   0%|          | 0/11906 [00:00<?, ? examples/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.1506,0.220342,0.919368,0.919541
2,0.0771,0.260035,0.937427,0.937874
3,0.0516,0.272438,0.950109,0.950245


Final evaluation results: {'eval_loss': 0.27243772149086, 'eval_accuracy': 0.950109188644381, 'eval_f1': 0.9502450404838204, 'eval_runtime': 90.516, 'eval_samples_per_second': 131.535, 'eval_steps_per_second': 8.231, 'epoch': 3.0}


In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch

# Path of the saved fine-tuned model
model_path = "/content/stock_sentiment_results/model"

# Load the model and tokenizer from the local directory
model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Prediction function
def predict_sentiment(text):
    """Return class probabilities for ``text`` from the loaded model.

    Uses the notebook-level ``tokenizer`` and ``model`` globals.

    Args:
        text: A string (or list of strings) to classify.

    Returns:
        A NumPy array with one row per input text and one column per class,
        holding the softmax of the model's logits.
    """
    inputs = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
    # Run on whatever device the model lives on, so the function keeps working
    # after the model has been moved to a GPU.
    inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}
    # Inference only: skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
        outputs = model(**inputs)
        prediction = torch.nn.functional.softmax(outputs.logits, dim=-1)
    # .cpu() is a no-op on CPU and is required before .numpy() on CUDA tensors.
    return prediction.cpu().numpy()

# Smoke test on a single news paragraph; prints the array of class
# probabilities returned by predict_sentiment.
print(predict_sentiment("The AOT Property Showcase: The Six Pillars of Opportunity held on Tuesday, aims to attract private sector investment to transform these areas into new economic hubs, encompassing a total of 2,512 rai (approximately 623 acres) with a potential investment value geared towards establishing Thailand as a leading aviation centre in the region"))

[[9.9969387e-01 1.2324666e-04 1.8298386e-04]]


In [None]:
# Show the class-id -> label-name mapping stored in the loaded model's config;
# it tells which class each probability column refers to.
print(model.config.id2label)

{0: 'positive', 1: 'negative', 2: 'neutral'}


In [None]:
from transformers import pipeline

# Build a text-classification pipeline on top of the locally saved checkpoint
# (the same directory serves as model and tokenizer source).
model_path = "/content/stock_sentiment_results/model"
classifier = pipeline(task="text-classification", model=model_path, tokenizer=model_path)

# Sample news snippets to classify.
texts = [
    "The AOT Property Showcase: The Six Pillars of Opportunity held on Tuesday, aims to attract private sector investment to transform these areas into new economic hubs, encompassing a total of 2,512 rai (approximately 623 acres) with a potential investment value geared towards establishing Thailand as a leading aviation centre in the region",
    "ABC Pharmaceuticals (NASDAQ: ABCP) nosedived 20% after reports surfaced that the FDA is probing potential data manipulation in key clinical trials for its blockbuster drug. Investors fear recalls, lawsuits, and regulatory penalties.",
    "GHI Automotive (NYSE: GHI) warned of a 40% drop in Q3 earnings due to semiconductor shortages and rising logistics costs. Analysts downgraded the stock, predicting further pain ahead",
]

# Classify the whole batch at once, then report each text with its top label,
# followed by a blank separator line.
results = classifier(texts)
for text, result in zip(texts, results):
    print(
        f"Text: {text}",
        f"Predicted sentiment: {result['label']} (confidence: {result['score']:.2f})",
        "",
        sep="\n",
    )

Device set to use cuda:0


Text: The AOT Property Showcase: The Six Pillars of Opportunity held on Tuesday, aims to attract private sector investment to transform these areas into new economic hubs, encompassing a total of 2,512 rai (approximately 623 acres) with a potential investment value geared towards establishing Thailand as a leading aviation centre in the region
Predicted sentiment: positive (confidence: 1.00)

Text: ABC Pharmaceuticals (NASDAQ: ABCP) nosedived 20% after reports surfaced that the FDA is probing potential data manipulation in key clinical trials for its blockbuster drug. Investors fear recalls, lawsuits, and regulatory penalties.
Predicted sentiment: negative (confidence: 1.00)

Text: GHI Automotive (NYSE: GHI) warned of a 40% drop in Q3 earnings due to semiconductor shortages and rising logistics costs. Analysts downgraded the stock, predicting further pain ahead
Predicted sentiment: negative (confidence: 1.00)



In [None]:
!pip install huggingface_hub



In [None]:
from huggingface_hub import notebook_login
# Interactive login: renders a widget in the cell output where the access
# token is entered, so no token is hard-coded in the notebook.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Upload the model and then the tokenizer to the same Hub repository.
# `model` and `tokenizer` are notebook globals; when the notebook runs top to
# bottom they are the objects loaded from the local saved-model directory.
# NOTE(review): presumably requires the Hub login cell to have been run first.
model.push_to_hub("Photchara/stock_sentiment_Finbert_label")
tokenizer.push_to_hub("Photchara/stock_sentiment_Finbert_label")

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Photchara/stock_sentiment_Finbert_label/commit/6e7f96c81707eaacb5ff35a25202ab93d87947df', commit_message='Upload tokenizer', commit_description='', oid='6e7f96c81707eaacb5ff35a25202ab93d87947df', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Photchara/stock_sentiment_Finbert_label', endpoint='https://huggingface.co', repo_type='model', repo_id='Photchara/stock_sentiment_Finbert_label'), pr_revision=None, pr_num=None)

In [None]:
from transformers import pipeline

# Build the pipeline from the published Hub repository this time, to confirm
# the uploaded checkpoint loads and predicts.
model_path = "Photchara/stock_sentiment_Finbert_label"
classifier = pipeline(task="text-classification", model=model_path, tokenizer=model_path)

# Single multi-sentence news excerpt used as the example input.
text = """intel Corp. plans employment reductions and a tighter return-to-work policy in coming months, but the company, a major employer in Chandler, didn't announce specific layoff numbers in a quarterly earnings report on April 24 or in a message from its new CEO.
Media reports in recent days have suggested Intel could lay off more than 20% of its workforce, which numbered about 109,000 employees at the end of last year, including 12,000 in and around Chandler. The semiconductor giant lost $18.8 billion in 2024 amid a flagging performance."""

# Classify the excerpt and print the raw pipeline output.
result = classifier(text)
print(result)

Device set to use cuda:0


[{'label': 'negative', 'score': 0.9999463558197021}]
