# In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from datasets import Dataset
import pandas as pd

# Load dataset
data = pd.read_csv("complaints.csv")

# Data Preprocessing
# Lower-case the complaint text and strip punctuation.  regex=True is required:
# pandas >= 2.0 treats the pattern as a literal string by default, in which
# case the character class below would never match and nothing is removed.
data['crimeaditionalinfo'] = (
    data['crimeaditionalinfo']
    .str.lower()
    .str.replace(r'[^\w\s]+', '', regex=True)
)
data = data.dropna(subset=['category', 'sub_category', 'crimeaditionalinfo'])

# Encode the string categories as integer class ids in a "labels" column.
# The model needs integer targets under that name to compute a loss; the
# original "category" column is kept for stratification and inspection.
# sort=True makes the id assignment independent of row order, and
# category_names[i] is the category string for class id i.
label_ids, category_names = pd.factorize(data['category'], sort=True)
data = data.assign(labels=label_ids)

# Split the data
# Stratify on category so both splits keep the class proportions; a fixed
# random_state makes the split reproducible between runs.
train_data, test_data = train_test_split(
    data,
    test_size=0.2,
    stratify=data['category'],
    random_state=42,
)

# Convert to Hugging Face Dataset
# preserve_index=False keeps the shuffled pandas index from being stored as an
# extra "__index_level_0__" column.
train_dataset = Dataset.from_pandas(train_data, preserve_index=False)
test_dataset = Dataset.from_pandas(test_data, preserve_index=False)

# Load Pre-trained Model and Tokenizer
# The tokenizer and the model are loaded from the same checkpoint name;
# num_labels is the number of distinct values in the category column.
checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
num_categories = len(data['category'].unique())
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=num_categories,
)

# Tokenize the data
def tokenize_function(examples):
    """Tokenize the ``crimeaditionalinfo`` field of ``examples``.

    Passes that field to the module-level ``tokenizer`` with truncation
    enabled and ``padding="max_length"``, and returns the tokenizer's
    result unchanged.
    """
    complaint_text = examples["crimeaditionalinfo"]
    return tokenizer(complaint_text, truncation=True, padding="max_length")

# Run tokenize_function over both splits in batched mode, train split first.
train_dataset, test_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, test_dataset)
)

# Set up Trainer
# Hyperparameters are collected in one mapping and expanded into
# TrainingArguments as keyword arguments.
training_config = {
    "output_dir": "./results",
    "evaluation_strategy": "epoch",
    "per_device_train_batch_size": 8,
    "per_device_eval_batch_size": 8,
    "num_train_epochs": 3,
    "weight_decay": 0.01,
}
training_args = TrainingArguments(**training_config)

# The tokenized test split is passed as the evaluation dataset.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=test_dataset,
    train_dataset=train_dataset,
)

# Train the model
trainer.train()
