<a href="https://colab.research.google.com/github/Tanish-analyst/financial-sentiment-analyzer/blob/main/Source%20code/fine_tuned_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

# Read the labelled headline spreadsheet into a DataFrame and display it.
excel_path = "/content/chatgpt senti.xlsx"
df = pd.read_excel(excel_path)
df

Unnamed: 0,headlines,sentiment
0,Company profits surge 25 percent in latest qua...,positive
1,Stock market hits record high amid investor op...,positive
2,Federal Reserve signals confidence in economic...,positive
3,Unemployment rate drops to lowest level in a d...,positive
4,Tech sector leads Wall Street rally with stron...,positive
...,...,...
2995,Global trade organization maintains current ta...,neutral
2996,Business mentorship program expands outreach e...,neutral
2997,Healthcare information systems firm enhances i...,neutral
2998,Industrial tech manufacturer maintains robotic...,neutral


In [None]:
# Integer class id assigned to each sentiment string.
label_mapping = {"neutral": 0, "positive": 1, "negative": 2}

# Encode only while the column still holds the raw labels, so that re-running
# this cell does not map the already-encoded integers to NaN.
if not pd.api.types.is_integer_dtype(df['sentiment']):
    encoded = df['sentiment'].map(label_mapping)
    # Series.map yields NaN for any value missing from the mapping; fail loudly
    # here instead of letting NaN labels reach training.
    if encoded.isna().any():
        unknown = df.loc[encoded.isna(), 'sentiment'].unique().tolist()
        raise ValueError(f"Unmapped sentiment labels: {unknown}")
    df['sentiment'] = encoded.astype("int64")
df

Unnamed: 0,headlines,sentiment
0,Company profits surge 25 percent in latest qua...,1
1,Stock market hits record high amid investor op...,1
2,Federal Reserve signals confidence in economic...,1
3,Unemployment rate drops to lowest level in a d...,1
4,Tech sector leads Wall Street rally with stron...,1
...,...,...
2995,Global trade organization maintains current ta...,0
2996,Business mentorship program expands outreach e...,0
2997,Healthcare information systems firm enhances i...,0
2998,Industrial tech manufacturer maintains robotic...,0


In [None]:
from datasets import Dataset

# Wrap the DataFrame in a Hugging Face Dataset and shuffle its rows with a
# fixed seed in a single chained expression.
dataset = Dataset.from_pandas(df).shuffle(seed=42)

In [None]:
# Display the shuffled dataset's summary (feature names and row count).
dataset

Dataset({
    features: ['headlines', 'sentiment'],
    num_rows: 3000
})

In [None]:
# Rename the target column to "label", the name the later formatting/training
# cells select. The guard keeps this cell re-runnable: once the column has been
# renamed, "sentiment" no longer exists and an unguarded rename would raise.
if "sentiment" in dataset.column_names:
    dataset = dataset.rename_column("sentiment", "label")

In [None]:
# Hold out 20% of the rows for evaluation. The split shuffles on its own, so it
# needs an explicit seed to be reproducible across kernel restarts (the seed
# passed to the earlier shuffle does not carry over to this call).
split_dataset = dataset.train_test_split(test_size=0.2, seed=42)

# Access the train and test datasets
train_dataset = split_dataset['train']
test_dataset = split_dataset['test']

In [None]:
# Display both splits side by side to check their features and row counts.
train_dataset,test_dataset

(Dataset({
     features: ['headlines', 'label'],
     num_rows: 2400
 }),
 Dataset({
     features: ['headlines', 'label'],
     num_rows: 600
 }))

In [None]:
from transformers import AutoTokenizer

model_name = "yiyanghkust/finbert-tone"  # Or any other model
tokenizer = AutoTokenizer.from_pretrained(model_name)

# This tokenizer reports no predefined maximum length, so padding="max_length"
# and truncation=True are silently disabled unless max_length is given
# explicitly. 128 tokens is assumed to be ample for a headline; adjust if the
# data contains longer texts.
MAX_LENGTH = 128

def tokenize_function(example):
    """Tokenize a batch of headlines to fixed-length model inputs.

    Args:
        example: A batch from ``Dataset.map(batched=True)``; its "headlines"
            entry is passed to the tokenizer.

    Returns:
        The tokenizer output, truncated and padded to ``MAX_LENGTH`` tokens.
    """
    return tokenizer(
        example["headlines"],
        truncation=True,
        padding="max_length",
        max_length=MAX_LENGTH,
    )

tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_test = test_dataset.map(tokenize_function, batched=True)


Map:   0%|          | 0/2400 [00:00<?, ? examples/s]

Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/600 [00:00<?, ? examples/s]

In [None]:
# Display the tokenized training split to confirm the columns added by the tokenizer.
tokenized_train

Dataset({
    features: ['headlines', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 2400
})

In [None]:
# Expose only the model inputs and the label as torch tensors, applying the
# same column selection to both splits.
torch_columns = ["input_ids", "attention_mask", "token_type_ids", "label"]
for tokenized_split in (tokenized_train, tokenized_test):
    tokenized_split.set_format(type="torch", columns=torch_columns)

In [None]:
from transformers import AutoModelForSequenceClassification

# Load the checkpoint named by `model_name` with a three-way classification head.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3,
)


pytorch_model.bin:   0%|          | 0.00/439M [00:00<?, ?B/s]

In [None]:
# Log in to the Hugging Face Hub from the shell; the command prompts
# interactively for an access token.
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) Y
Token is valid (permission: write

In [None]:
from transformers import TrainingArguments

# Hyperparameters for the fine-tuning run.
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    # Log the training loss once per epoch. With the default step-based
    # logging interval this short run (450 steps in total) never reached a
    # logging step, so the results table showed "No log" for every epoch.
    logging_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=True
)


In [None]:
%pip install evaluate
import evaluate

accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    return accuracy.compute(predictions=predictions, references=labels)


Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [None]:
from transformers import Trainer

# Assemble the training loop from the model, arguments, tokenized splits and
# the accuracy metric.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    # `tokenizer=` is deprecated on Trainer.__init__ and emits a warning when
    # the trainer is built; `processing_class=` is its replacement.
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

# Run fine-tuning; evaluation runs at the end of each epoch per `training_args`.
trainer.train()

  trainer = Trainer(


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mrajualbum295[0m ([33mrajualbum295-maharaja-surajmal-institute-of-technology[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.108795,0.963333
2,No log,0.136546,0.963333
3,No log,0.149948,0.97


TrainOutput(global_step=450, training_loss=0.16506856282552082, metrics={'train_runtime': 164.357, 'train_samples_per_second': 43.807, 'train_steps_per_second': 2.738, 'total_flos': 43783717168800.0, 'train_loss': 0.16506856282552082, 'epoch': 3.0})

In [None]:
# Push the trainer's results to the Hugging Face Hub; the returned commit info
# is displayed as the cell output.
trainer.push_to_hub()

CommitInfo(commit_url='https://huggingface.co/kkkkkjjjjjj/results/commit/0f934f13d8e8bb30bd78d715f29f9d3e4fa4de8d', commit_message='End of training', commit_description='', oid='0f934f13d8e8bb30bd78d715f29f9d3e4fa4de8d', pr_url=None, repo_url=RepoUrl('https://huggingface.co/kkkkkjjjjjj/results', endpoint='https://huggingface.co', repo_type='model', repo_id='kkkkkjjjjjj/results'), pr_revision=None, pr_num=None)