In [None]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Load the training and test splits from Google Drive.
# NOTE: Drive must already be mounted at /content/drive when this cell runs.
train_df = pd.read_csv('/content/drive/MyDrive/final_train.csv')
test_df = pd.read_csv('/content/drive/MyDrive/cleaned_test.csv')

# Fill missing text BEFORE casting to str: astype(str) turns NaN into the literal
# string "nan", after which fillna("") has nothing left to replace.
train_df['cleaned_crime_info'] = train_df['cleaned_crime_info'].fillna("").astype(str)
test_df['cleaned_crime_info'] = test_df['cleaned_crime_info'].fillna("").astype(str)

# Map category names to integer class ids; the encoder is fitted on the training data only.
label_encoder = LabelEncoder()
train_df['label'] = label_encoder.fit_transform(train_df['category'])

train_categories = set(train_df['category'])

# Keep only test rows whose category was seen in training, since the encoder cannot
# transform unseen labels. .copy() yields an independent frame, so adding the
# 'label' column below no longer triggers pandas' SettingWithCopyWarning.
filtered_test_df = test_df[test_df['category'].isin(train_categories)].copy()
filtered_test_df['label'] = label_encoder.transform(filtered_test_df['category'])

# Tokenizer for the "bert-base-uncased" checkpoint (the same checkpoint name the
# classifier is loaded from further down).
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def tokenize_data(dataframe, text_column='cleaned_crime_info', max_length=128):
    """Tokenize one text column of a DataFrame with the module-level ``tokenizer``.

    Args:
        dataframe: DataFrame holding the texts to encode.
        text_column: Name of the column containing the raw strings.
            Defaults to ``'cleaned_crime_info'`` (the column used in this notebook).
        max_length: Maximum number of tokens per example; longer texts are
            truncated. Defaults to 128.

    Returns:
        The tokenizer output for the whole column, as PyTorch tensors
        (``return_tensors="pt"``), with padding enabled so all rows share one length.
    """
    texts = dataframe[text_column].tolist()
    return tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors="pt"
    )

# Tokenize both splits once, up front; the results are indexed row-by-row by
# CustomDataset below.
train_encodings = tokenize_data(train_df)
filtered_test_encodings = tokenize_data(filtered_test_df)

class CustomDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with per-example labels.

    Args:
        encodings: Mapping from field name (e.g. ``input_ids``) to a container
            indexable by example position (a tensor or a list of lists).
        labels: Sequence of class ids, one per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Number of examples, taken from the label sequence."""
        return len(self.labels)

    @staticmethod
    def _as_tensor(value):
        """Return an independent tensor holding ``value``.

        ``torch.tensor(existing_tensor)`` emits a copy-construct UserWarning on
        every call; ``clone().detach()`` is the equivalent PyTorch recommends.
        Non-tensor values (ints, lists) are still converted with ``torch.tensor``.
        """
        if isinstance(value, torch.Tensor):
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return a dict of tensors for example ``idx``, including its ``labels`` entry."""
        item = {key: self._as_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self._as_tensor(self.labels[idx])
        return item

# Wrap the encodings and integer labels so the Trainer can index them per example.
train_dataset = CustomDataset(train_encodings, train_df['label'].tolist())
filtered_test_dataset = CustomDataset(filtered_test_encodings, filtered_test_df['label'].tolist())

# One output class per category seen in training. The classification head is not
# part of the pretrained checkpoint, so it starts from a fresh initialisation.
num_labels = len(label_encoder.classes_)
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_labels)

# Fine-tuning hyperparameters. Evaluation and checkpointing are both configured
# per epoch, and at most two checkpoints are kept on disk.
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    # NOTE(review): presumably has no effect while save_strategy="epoch" — confirm
    # against the TrainingArguments documentation.
    save_steps=500,
    save_total_limit=2,
    # Reload the best checkpoint when training ends (presumably chosen by eval
    # loss, since no metric_for_best_model is given — confirm).
    load_best_model_at_end=True
)

# NOTE(review): eval_dataset is the filtered *test* set, and training_args sets
# load_best_model_at_end=True, so checkpoint selection also uses that set. Metrics
# reported later on the same data may therefore be optimistic; consider carving a
# validation split out of train_df (train_test_split is imported but unused).
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=filtered_test_dataset,
    tokenizer=tokenizer
)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_test_df['label'] = label_encoder.transform(filtered_test_df['category'])
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


In [None]:
# Mount Google Drive at /content/drive.
# NOTE(review): the data-loading cell reads CSVs from /content/drive/MyDrive, so on
# a fresh kernel this cell has to run before it — it should be the first cell.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Run fine-tuning with the configured Trainer (3 epochs per training_args).
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss
1,0.6695,0.672436
2,0.5464,0.666516


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


In [None]:
import joblib

# Evaluate on the Trainer's eval_dataset and show the resulting metrics dict.
metrics = trainer.evaluate()
print("Evaluation Metrics:", metrics)

# Persist everything needed for inference in one directory: model weights,
# tokenizer files, and the fitted label encoder (to map class ids back to names).
SAVE_DIR = './bert-fine-tuned-1'
model.save_pretrained(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)
joblib.dump(label_encoder, f'{SAVE_DIR}/label_encoder.pkl')

In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, classification_report

# Make sure dropout etc. are disabled before scoring the test set.
trainer.model.eval()

# predictions.predictions holds one row of logits per example; argmax gives the class id.
predictions = trainer.predict(filtered_test_dataset)
predicted_labels = np.argmax(predictions.predictions, axis=1)

y_test = filtered_test_df['label'].tolist()

accuracy = accuracy_score(y_test, predicted_labels)
print(f"Accuracy: {accuracy}")

# Report only on classes that actually occur in the test set, and look up their
# names so labels and target_names stay aligned.
unique_labels_test = np.unique(y_test)
target_names_test = label_encoder.classes_[unique_labels_test]

# zero_division=0 keeps the same scores as the default for classes that are never
# predicted (precision reported as 0) but without the UndefinedMetricWarning spam.
print(classification_report(
    y_test,
    predicted_labels,
    labels=unique_labels_test,
    target_names=target_names_test,
    zero_division=0,
))

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Accuracy: 0.771880354083381
                                                      precision    recall  f1-score   support

                               Any Other Cyber Crime       0.54      0.24      0.34      3291
Child Pornography CPChild Sexual Abuse Material CSAM       0.61      0.37      0.46       115
                                Cryptocurrency Crime       0.53      0.71      0.61       151
                      Cyber Attack/ Dependent Crimes       1.00      1.00      1.00      1261
                                     Cyber Terrorism       0.00      0.00      0.00        47
      Hacking  Damage to computercomputer system etc       0.37      0.36      0.36       514
                            Online Cyber Trafficking       0.00      0.00      0.00        57
                              Online Financial Fraud       0.83      0.96      0.89     17607
                            Online Gambling  Betting       0.59      0.08      0.15       118
               Online and Socia

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
import torch

def predict(text):
    """Classify a single text and return its predicted category name.

    Args:
        text: One input string. (A batch would make ``.item()`` below fail,
            since it requires exactly one prediction.)

    Returns:
        The original category label, recovered from the predicted class id via
        the module-level ``label_encoder``.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)

    # Move the inputs to whatever device the model currently lives on.
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    # Inference only: disable dropout and skip building the autograd graph.
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)

    prediction = torch.argmax(outputs.logits, dim=-1).item()
    return label_encoder.inverse_transform([prediction])[0]

# Smoke test: classify one example sentence and print the predicted category name.
print(predict("Sensitive financial data was leaked."))