In [None]:
import os
import torch
import numpy as np
import pandas as pd
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
# Colab-only: mount Google Drive at /content/drive so the project folder used
# by the cells below is reachable. force_remount=True requests a remount even
# when the drive is already mounted.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# List the project folder on the mounted Drive to check that the dataset CSVs
# and the output directories are where the training code expects them.
!ls "/content/drive/My Drive/Projet IA"

balanced_dataset_1000.csv	   dataset_creation.ipynb		     model_save
balanced_dataset.csv		   harassment_model_save		     model_test.ipynb
balanced_filtered_dataset.csv	   harassment_results			     Model_training.ipynb
balanced_harassment_1000.csv	   logs					     results
balanced_harassment_dataset.csv    Model_classification_training.ipynb
binary_classification_dataset.csv  Model_MultiClassification_training.ipynb


In [None]:
# Custom Dataset class to handle input encodings and labels
class HarassmentDataset(Dataset):
    """Torch dataset pairing pre-tokenized texts with integer class labels.

    All texts are tokenized once, up front, in ``__init__``; ``__getitem__``
    then only converts the stored per-sample values to tensors.

    Args:
        texts: Iterable of raw strings (list, tuple, pandas Series, ...).
        labels: Iterable of integer class ids, one per text.
        tokenizer: Callable invoked as
            ``tokenizer(texts, truncation=True, padding=True, max_length=...)``
            that returns a mapping from feature name to a per-sample sequence.
        max_length: Maximum token length forwarded to the tokenizer.

    Raises:
        ValueError: If ``texts`` and ``labels`` do not have the same length.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        # Materialize both inputs as plain lists so positional indexing in
        # __getitem__ is well defined. A pandas Series keeps its original
        # (possibly shuffled) index labels, so `labels[idx]` would otherwise be
        # a label lookup rather than a positional one.
        texts = list(texts)
        labels = list(labels)
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(texts)} and {len(labels)})"
            )

        self.encodings = tokenizer(texts, truncation=True, padding=True, max_length=max_length)
        self.labels = labels

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors, including a long ``labels`` tensor."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        """Number of samples (one per label)."""
        return len(self.labels)

In [None]:
# Function to compute metrics
def compute_metrics(pred):
    """Score one evaluation pass of the binary classifier.

    ``pred`` is unpacked as ``(logits, labels)``; the predicted class of each
    sample is the arg-max over the last axis of the logits.

    Returns:
        dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``
        (precision, recall and F1 are computed with ``average="binary"``).
    """
    scores, gold = pred
    predicted = np.argmax(scores, axis=-1)

    accuracy = accuracy_score(gold, predicted)
    # The fourth element of the returned tuple is not reported.
    precision, recall, f1 = precision_recall_fscore_support(
        gold, predicted, average="binary"
    )[:3]

    return dict(accuracy=accuracy, precision=precision, recall=recall, f1=f1)

In [None]:
def _has_saved_checkpoint(output_dir):
    """Return True if `output_dir` holds at least one `checkpoint-<step>` directory."""
    if not os.path.isdir(output_dir):
        return False
    prefix = "checkpoint-"
    return any(
        entry.startswith(prefix)
        and entry[len(prefix):].isdigit()
        and os.path.isdir(os.path.join(output_dir, entry))
        for entry in os.listdir(output_dir)
    )


def main(path, checkpoint=False):
    """Fine-tune ``microsoft/deberta-v3-small`` as a binary harassment classifier.

    Steps: read ``binary_classification_dataset.csv`` from ``path`` (columns
    ``text`` and ``label``), split it 80/20 with a fixed seed, tokenize both
    splits, train for 5 epochs while evaluating and checkpointing every epoch,
    keep the checkpoint with the best F1, save model and tokenizer under
    ``harassment_model_save`` and print the final evaluation metrics.

    Args:
        path: Project directory containing the dataset; training outputs
            (``harassment_results``, ``logs``, ``harassment_model_save``) are
            written below it.
        checkpoint: When True, resume from a checkpoint in
            ``harassment_results`` if one exists; if none is found, a notice
            is printed and training starts from scratch instead of raising.

    Returns:
        None. Metrics are printed, artifacts are written to disk.
    """
    data_path = os.path.join(path, "binary_classification_dataset.csv")
    output_dir = os.path.join(path, "harassment_results")
    save_dir = os.path.join(path, "harassment_model_save")

    df = pd.read_csv(data_path)
    # Rows with a missing text or label can be neither tokenized nor converted
    # to a label tensor, so drop them before building the splits.
    df_clean = df.dropna(subset=["text", "label"])

    texts = df_clean["text"].astype(str).tolist()
    labels = df_clean["label"].astype(int).tolist()

    train_texts, val_texts, train_labels, val_labels = train_test_split(
        texts, labels, test_size=0.2, random_state=42
    )

    model_name = "microsoft/deberta-v3-small"
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    train_dataset = HarassmentDataset(train_texts, train_labels, tokenizer)
    val_dataset = HarassmentDataset(val_texts, val_labels, tokenizer)

    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

    # NOTE(review): recent transformers releases rename `evaluation_strategy`
    # to `eval_strategy` — confirm against the installed version.
    training_args = TrainingArguments(
        output_dir=output_dir,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=1e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=5,
        weight_decay=0.01,
        logging_dir=os.path.join(path, "logs"),
        logging_steps=10,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        greater_is_better=True,
        save_total_limit=2,
    )

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    # Only ask the Trainer to resume when there is actually something to resume
    # from; otherwise a first run with checkpoint=True would fail.
    resume = checkpoint and _has_saved_checkpoint(output_dir)
    if checkpoint and not resume:
        print(f"No checkpoint found in {output_dir}; starting training from scratch.")

    if resume:
        trainer.train(resume_from_checkpoint=True)
    else:
        trainer.train()

    model.save_pretrained(save_dir)
    tokenizer.save_pretrained(save_dir)

    metrics = trainer.evaluate()
    print("Evaluation Metrics:", metrics)

In [None]:
# Entry point for script
# The cell defining main() must have been executed before this one; otherwise
# the call fails with "NameError: name 'main' is not defined".
# checkpoint=True asks main() to resume training from a saved checkpoint.
if __name__ == "__main__":
    main(path="/content/drive/My Drive/Projet IA",checkpoint=True)

NameError: name 'main' is not defined

TESTING


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch.nn.functional as F

# Reload the fine-tuned classifier and its tokenizer from the saved directory.
model_dir = "/content/drive/My Drive/Projet IA/harassment_model_save"
model = AutoModelForSequenceClassification.from_pretrained(model_dir)
tokenizer = AutoTokenizer.from_pretrained(model_dir)

# Inference only: put the model in evaluation mode.
model.eval()

# Hand-written probe sentences covering abusive, neutral and critical-but-polite text.
custom_sentences = [
    "You're son of a bitch and I will puch you if you don't stop",
    "I want to kill myself tonight",
    "I want to punch you, your bitch!",
    "Thank you for helping me out, you're so kind.",
    "you're useless and not worth a single penny, you should leave the company immediately!!!",
    "I think you could improve your product because as of now is not good, but there's still margin",
    "&gt; Also your high beams are for dark roads and when there are no other people on them.\n\nAnd when there's no fog.\n\nAlso, SHUT DOWN THOSE FUCKING FOGLAMPS WHEN THERE'S NO FOG"
]

# Encode every probe in a single padded batch of PyTorch tensors.
inputs = tokenizer(
    custom_sentences,
    max_length=128,
    truncation=True,
    padding=True,
    return_tensors="pt",
)

# Forward pass without gradient tracking; softmax over the last axis turns the
# logits into per-class probabilities.
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits
    probabilities = F.softmax(logits, dim=-1)

# Index 1 is reported as "Harassment", index 0 as "Not Harassment".
for sentence, probs in zip(custom_sentences, probabilities):
    predicted_class = int(probs.argmax())
    verdict = 'Harassment' if predicted_class == 1 else 'Not Harassment'
    print(f"Input: {sentence}")
    print(f"  Harassment Probability: {probs[1]:.4f}")
    print(f"  Not Harassment Probability: {probs[0]:.4f}")
    print(f"  Prediction: {verdict}\n")


Input: You're son of a bitch and I will puch you if you don't stop
  Harassment Probability: 0.9998
  Not Harassment Probability: 0.0002
  Prediction: Harassment

Input: I want to kill myself tonight
  Harassment Probability: 0.9998
  Not Harassment Probability: 0.0002
  Prediction: Harassment

Input: I want to punch you, your bitch!
  Harassment Probability: 0.9997
  Not Harassment Probability: 0.0003
  Prediction: Harassment

Input: Thank you for helping me out, you're so kind.
  Harassment Probability: 0.0000
  Not Harassment Probability: 1.0000
  Prediction: Not Harassment

Input: you're useless and not worth a single penny, you should leave the company immediately!!!
  Harassment Probability: 0.9982
  Not Harassment Probability: 0.0018
  Prediction: Harassment

Input: I think you could improve your product because as of now is not good, but there's still margin
  Harassment Probability: 0.0000
  Not Harassment Probability: 1.0000
  Prediction: Not Harassment

Input: &gt; Also your

In [None]:
# Show the GPU attached to this runtime (device name, driver/CUDA version, memory usage).
!nvidia-smi

Fri Jan 17 16:11:54 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   40C    P8               9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
# Ask ipinfo.io for the runtime's public IP address and geolocation.
# NOTE(review): the response contains the machine's public IP and hostname —
# clear this cell's output before sharing or committing the notebook.
!curl ipinfo.io

{
  "ip": "34.139.159.179",
  "hostname": "179.159.139.34.bc.googleusercontent.com",
  "city": "North Charleston",
  "region": "South Carolina",
  "country": "US",
  "loc": "32.8546,-79.9748",
  "org": "AS396982 Google LLC",
  "postal": "29415",
  "timezone": "America/New_York",
  "readme": "https://ipinfo.io/missingauth"
}