In [None]:
import pandas as pd
# Read the labelled complaints CSV and show the resulting frame as the cell output.
csv_path = "/content/cccc22.csv"
df = pd.read_csv(csv_path)
df

Unnamed: 0.1,Unnamed: 0,Complaints,Label
0,0,I was charged twice for a single transaction a...,Billing Issue
1,1,My credit card bill showed a late fee even tho...,Billing Issue
2,2,They added a mysterious charge to my statement...,Billing Issue
3,3,I canceled a subscription but still got billed...,Billing Issue
4,4,"My autopay was set up, but they still marked m...",Billing Issue
...,...,...,...
2158,2158,Card stopped working abroad even though intern...,Trouble_Using_Card
2159,2159,I cancelled a subscription but was still charg...,Billing Issue
2160,2160,Bank refused to block the card in time after r...,Fraud
2161,2161,Reward redemption is very complicated and limi...,Features


In [None]:
# Drop the "Unnamed: 0" column (it mirrors the row index in the preview above)
# and normalise the column names to "Complaints" / "label".
# errors="ignore" on drop, plus rename silently skipping missing keys, keeps
# this cell safe to re-run: the previous `del` raised KeyError the second time.
df = (
    df.drop(columns=["Unnamed: 0"], errors="ignore")
      .rename(columns={"Complaint": "Complaints", "Label": "label"})
)
df

Unnamed: 0,Complaints,label
0,I was charged twice for a single transaction a...,Billing Issue
1,My credit card bill showed a late fee even tho...,Billing Issue
2,They added a mysterious charge to my statement...,Billing Issue
3,I canceled a subscription but still got billed...,Billing Issue
4,"My autopay was set up, but they still marked m...",Billing Issue
...,...,...
2158,Card stopped working abroad even though intern...,Trouble_Using_Card
2159,I cancelled a subscription but was still charg...,Billing Issue
2160,Bank refused to block the card in time after r...,Fraud
2161,Reward redemption is very complicated and limi...,Features


In [None]:
# Number of complaints per category.
df["label"].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
Features,451
Trouble_Using_Card,435
Customer_service,430
Billing Issue,426
Fraud,421


In [None]:
import torch
# Report whether CUDA is usable and, when it is, the name of device 0.
has_gpu = torch.cuda.is_available()
print("GPU available:", has_gpu)
print("GPU name:", torch.cuda.get_device_name(0) if has_gpu else "CPU")


GPU available: True
GPU name: Tesla T4


In [None]:
from sklearn.preprocessing import LabelEncoder
# Fit the encoder on the category names, then overwrite the column with the
# integer ids it assigns.
label_encoder = LabelEncoder().fit(df['label'])
df['label'] = label_encoder.transform(df['label'])
# Resulting ids: Billing Issue=0, Customer_service=1, Features=2, Fraud=3, Trouble_Using_Card=4

In [None]:
# Same per-class counts as before, now keyed by the encoded integer ids.
df["label"].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
2,451
4,435
1,430
0,426
3,421


In [None]:
from datasets import Dataset
# Wrap the DataFrame in a Hugging Face Dataset and shuffle it with a fixed seed.
dataset = Dataset.from_pandas(df).shuffle(seed=42)
dataset

Dataset({
    features: ['Complaints', 'label'],
    num_rows: 2163
})

In [None]:
# Hold out 20% of the rows for evaluation.
# The explicit seed makes the split identical on every run; without it each
# execution produced different train/test rows and therefore different metrics.
split_dataset = dataset.train_test_split(test_size=0.2, seed=42)

train_dataset = split_dataset['train']
test_dataset = split_dataset['test']

In [None]:
def label_distribution(labels):
    """Return each label value's share of `labels` as a fraction of the total."""
    return pd.Series(labels).value_counts(normalize=True)


train_labels = train_dataset['label']
test_labels = test_dataset['label']

train_dist = label_distribution(train_labels)
test_dist = label_distribution(test_labels)

# value_counts(normalize=True) yields fractions (e.g. 0.2), not percentages,
# so the headers say "proportion" instead of the previous, misleading "(%)".
print("Train distribution (proportion):\n", train_dist)
print("\nTest distribution (proportion):\n", test_dist)

Train distribution (%):
 2    0.206358
4    0.206358
0    0.200000
1    0.193642
3    0.193642
Name: proportion, dtype: float64

Test distribution (%):
 1    0.219400
2    0.217090
3    0.198614
0    0.184758
4    0.180139
Name: proportion, dtype: float64


In [None]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


def tokenize(texts):
    """Tokenize the "Complaints" field of a batch, truncated/padded to 512 tokens."""
    encoding_options = {"truncation": True, "padding": "max_length", "max_length": 512}
    return tokenizer(texts["Complaints"], **encoding_options)


# Apply the tokenizer to both splits in batched mode (train first, then test).
tokenized_train, tokenized_test = (
    split.map(tokenize, batched=True) for split in (train_dataset, test_dataset)
)

Map:   0%|          | 0/1730 [00:00<?, ? examples/s]

Map:   0%|          | 0/433 [00:00<?, ? examples/s]

In [None]:
# Expose only the model inputs and the label, formatted as torch tensors.
torch_columns = ["input_ids", "attention_mask", "token_type_ids", "label"]
for tokenized_split in (tokenized_train, tokenized_test):
    tokenized_split.set_format(type="torch", columns=torch_columns)

In [None]:
from transformers import BertForSequenceClassification
# Store the id <-> category-name mapping in the model config so the saved and
# uploaded model reports real category names instead of generic LABEL_n ids.
# Assumes `label_encoder` was fitted on the original string labels.
id2label = {idx: str(name) for idx, name in enumerate(label_encoder.classes_)}
label2id = {name: idx for idx, name in id2label.items()}

model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(id2label),  # 5 classes for this dataset
    id2label=id2label,
    label2id=label2id,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from transformers import TrainingArguments

# Fine-tuning hyperparameters: evaluate and checkpoint once per epoch, then
# reload the checkpoint with the best accuracy when training finishes.
training_config = dict(
    output_dir='./results',
    eval_strategy='epoch',
    save_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
)
training_args = TrainingArguments(**training_config)

In [None]:
pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [None]:
import numpy as np
from evaluate import load  # Changed import

# Accuracy metric object obtained through the `evaluate` package's load().
accuracy_metric = load("accuracy")


def compute_metrics(eval_pred):
    """Score a (logits, labels) pair: argmax over the last axis, then accuracy."""
    raw_scores, references = eval_pred
    predicted_ids = np.argmax(raw_scores, axis=-1)
    return accuracy_metric.compute(predictions=predicted_ids, references=references)

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [None]:
from transformers import Trainer

# Wire the model, hyperparameters, tokenized splits and metric function together.
# NOTE(review): the held-out test split doubles as the per-epoch evaluation set,
# so it also drives best-checkpoint selection.
trainer_config = {
    "model": model,
    "args": training_args,
    "train_dataset": tokenized_train,
    "eval_dataset": tokenized_test,
    "compute_metrics": compute_metrics,
}
trainer = Trainer(**trainer_config)

# Run fine-tuning; the returned training summary is shown as the cell result.
train_output = trainer.train()
train_output



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mtanish01614902023[0m ([33mtanish01614902023-maharaja-surajmal-institute-of-technology[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy
1,0.7891,0.482834,0.884527
2,0.2862,0.262357,0.926097
3,0.1142,0.201934,0.937644


TrainOutput(global_step=327, training_loss=0.48859530632648995, metrics={'train_runtime': 564.3702, 'train_samples_per_second': 9.196, 'train_steps_per_second': 0.579, 'total_flos': 1365583159388160.0, 'train_loss': 0.48859530632648995, 'epoch': 3.0})

In [None]:
# Write the model and the tokenizer files into the same local directory.
export_dir = "credit_card_complaint_classifier"
model.save_pretrained(export_dir)
tokenizer.save_pretrained(export_dir)


('credit_card_complaint_classifier/tokenizer_config.json',
 'credit_card_complaint_classifier/special_tokens_map.json',
 'credit_card_complaint_classifier/vocab.txt',
 'credit_card_complaint_classifier/added_tokens.json')

In [None]:
!huggingface-cli login

from huggingface_hub import HfApi, HfFolder, Repository, create_repo, upload_folder

# Set your repo name
repo_name = "credit-card-complaint-classifier"

# Create repo (private=False for public)
create_repo(repo_name, private=False)

# Upload folder
upload_folder(
    repo_id=f"kkkkkjjjjjj/{repo_name}",
    folder_path="credit_card_complaint_classifier",
    commit_message="Upload fine-tuned credit card complaint classifier"
)



    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: write).
The token `colab11` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `colab11`


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/kkkkkjjjjjj/credit-card-complaint-classifier/commit/415d6eb9c7e5134fc03f5a1aa145e3f9c2589fef', commit_message='Upload fine-tuned credit card complaint classifier', commit_description='', oid='415d6eb9c7e5134fc03f5a1aa145e3f9c2589fef', pr_url=None, repo_url=RepoUrl('https://huggingface.co/kkkkkjjjjjj/credit-card-complaint-classifier', endpoint='https://huggingface.co', repo_type='model', repo_id='kkkkkjjjjjj/credit-card-complaint-classifier'), pr_revision=None, pr_num=None)