In [1]:
!pip install transformers datasets scikit-learn pandas torch tqdm

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [2]:
!pip install --upgrade transformers

Collecting transformers
  Downloading transformers-4.52.4-py3-none-any.whl.metadata (38 kB)
Downloading transformers-4.52.4-py3-none-any.whl (10.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.5/10.5 MB[0m [31m86.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.52.2
    Uninstalling transformers-4.52.2:
      Successfully uninstalled transformers-4.52.2
Successfully installed transformers-4.52.4


# **Dataset Preprocessing**

In [3]:
import pandas as pd

train_df = pd.read_csv("train.csv")
val_df = pd.read_csv("valid.csv")
test_df = pd.read_csv("test.csv")

# Quick check
print(train_df.head())


   Unique ID                                               Post  \
0          1  मेरे देश के हिन्दु बहुत निराले है। कुछ तो पक्क...   
1          2  सरकार हमेशा से किसानों की कमाई को बढ़ाने के लि...   
2          3  सुशांत ने जो बिजनेस डील 9 जून को की थी, वो डील...   
3          4  @prabhav218 साले जेएनयू छाप कमिने लोग हिन्दुओं...   
4          5  #unlock4guidelines - अनलॉक-4 के लिए गाइडलाइन्स...   

             Labels Set  
0        hate,offensive  
1           non-hostile  
2           non-hostile  
3  defamation,offensive  
4           non-hostile  


In [4]:
from sklearn.preprocessing import MultiLabelBinarizer

# Convert comma-separated string to list
def convert_labels(x):
    return x.split(',')

mlb = MultiLabelBinarizer()
train_labels = mlb.fit_transform(train_df["Labels Set"].apply(convert_labels))
val_labels = mlb.transform(val_df["Labels Set"].apply(convert_labels))
test_labels = mlb.transform(test_df["Labels Set"].apply(convert_labels))


In [5]:
test_labels

array([[1, 1, 0, 0, 1],
       [0, 0, 0, 1, 0],
       [0, 0, 1, 0, 0],
       ...,
       [0, 0, 0, 0, 1],
       [0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0]])

# **Model Development**

In [6]:
from transformers import AutoTokenizer

# Choose model
model_checkpoint = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def tokenize(batch):
    return tokenizer(batch["Post"], padding=True, truncation=True, max_length=256)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

In [7]:
import torch
from torch.utils.data import Dataset

class HostilityDataset(Dataset):
    def __init__(self, texts, labels, tokenizer):
        self.encodings = tokenizer(texts.tolist(), truncation=True, padding=True, max_length=256)
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx]).float()
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = HostilityDataset(train_df["Post"], train_labels, tokenizer)
val_dataset = HostilityDataset(val_df["Post"], val_labels, tokenizer)
test_dataset = HostilityDataset(test_df["Post"], test_labels, tokenizer)


In [8]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(mlb.classes_),
    problem_type="multi_label_classification"
)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./results",
    # Changed evaluation_strategy to eval_strategy
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    logging_dir='./logs',
    logging_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)

from sklearn.metrics import f1_score

def compute_metrics(p):
    preds = (p.predictions > 0.5).astype(int)
    return {"f1": f1_score(p.label_ids, preds, average="macro")}

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

trainer.train()



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mnitikaahlawat3[0m ([33mnitikaahlawat3-thapar-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,F1
1,0.3057,0.293708,0.181242
2,0.2392,0.286318,0.311405
3,0.2327,0.244559,0.404897
4,0.2114,0.226994,0.347032
5,0.1813,0.217755,0.447125


TrainOutput(global_step=3580, training_loss=0.24440907499643677, metrics={'train_runtime': 1732.9191, 'train_samples_per_second': 16.527, 'train_steps_per_second': 2.066, 'total_flos': 3767851800084480.0, 'train_loss': 0.24440907499643677, 'epoch': 5.0})

In [10]:
predictions = trainer.predict(test_dataset)
preds = (predictions.predictions > 0.5).astype(int)

# Evaluate
print("F1 Score:", f1_score(test_labels, preds, average="macro"))


F1 Score: 0.4473030420747408


In [11]:
model.save_pretrained("./hostility_model")
tokenizer.save_pretrained("./hostility_model")


('./hostility_model/tokenizer_config.json',
 './hostility_model/special_tokens_map.json',
 './hostility_model/sentencepiece.bpe.model',
 './hostility_model/added_tokens.json',
 './hostility_model/tokenizer.json')

# **Model Predictions**

In [12]:
!pip install fastapi uvicorn python-multipart transformers torch scikit-learn


Collecting fastapi
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn
  Downloading uvicorn-0.34.3-py3-none-any.whl.metadata (6.5 kB)
Collecting python-multipart
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting starlette<0.47.0,>=0.40.0 (from fastapi)
  Downloading starlette-0.46.2-py3-none-any.whl.metadata (6.2 kB)
Downloading fastapi-0.115.12-py3-none-any.whl (95 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m95.2/95.2 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading uvicorn-0.34.3-py3-none-any.whl (62 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.4/62.4 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading python_multipart-0.0.20-py3-none-any.whl (24 kB)
Downloading starlette-0.46.2-py3-none-any.whl (72 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.0/72.0 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected 

In [13]:
import pickle

with open("mlb.pkl", "wb") as f:
    pickle.dump(mlb, f)


In [14]:
from fastapi import FastAPI
from pydantic import BaseModel
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import pickle
import numpy as np

# Load model and tokenizer
# Change MODEL_PATH to the correct save directory
MODEL_PATH = "./hostility_model"
model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH)
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model.eval()

# Load label binarizer
with open("mlb.pkl", "rb") as f:
    mlb = pickle.load(f)

# Set up FastAPI app
app = FastAPI(title="Hostility Detection API")

# Request body format
class InputText(BaseModel):
    text: str
@app.post("/predict")
def predict(input: InputText):
    try:
        # Step 1: Tokenize
        inputs = tokenizer(input.text, return_tensors="pt", truncation=True, padding=True, max_length=256)

        # Step 2: Get model output
        with torch.no_grad():
            outputs = model(**inputs)
            logits = torch.sigmoid(outputs.logits)

        # Step 3: Convert to binary predictions
        preds = (logits > 0.5).int().numpy()

        # Step 4: Decode labels
        labels = mlb.inverse_transform(preds)[0]

        return {"labels": labels}

    except Exception as e:
        return {"error": str(e)}


In [15]:
print(mlb.classes_)  # Should list all labels like 'defamation', 'hate', etc.

['defamation' 'fake' 'hate' 'non-hostile' 'offensive']


# **Model Evaluation**

In [16]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import torch
import numpy as np

# Assuming you have test_texts and test_labels (binarized with mlb)
# If not, load test set and transform labels:
# test_labels = mlb.transform(raw_test_labels)

def get_predictions(texts):
    model.eval()
    preds = []
    with torch.no_grad():
        for text in texts:
            inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=256)
            outputs = model(**inputs)
            probs = torch.sigmoid(outputs.logits)
            pred = (probs > 0.5).int().numpy()
            preds.append(pred[0])
    return np.array(preds)

# Get predictions
test_texts = test_df["Post"].tolist()
predictions = get_predictions(test_texts)

# Metrics
print("Accuracy (per label):", accuracy_score(test_labels, predictions))
print("Precision:", precision_score(test_labels, predictions, average='macro', zero_division=0))
print("Recall:", recall_score(test_labels, predictions, average='macro', zero_division=0))
print("F1 Score:", f1_score(test_labels, predictions, average='macro', zero_division=0))

# Classification Report
print("\nClassification Report:")
print(classification_report(test_labels, predictions, target_names=mlb.classes_))


Accuracy (per label): 0.6799758015728977
Precision: 0.4723194164673298
Recall: 0.4420490187539058
F1 Score: 0.45639387320454483

Classification Report:
              precision    recall  f1-score   support

  defamation       0.00      0.00      0.00       169
        fake       0.76      0.73      0.74       334
        hate       0.00      0.00      0.00       234
 non-hostile       0.98      0.95      0.96       873
   offensive       0.62      0.53      0.57       219

   micro avg       0.88      0.65      0.75      1829
   macro avg       0.47      0.44      0.46      1829
weighted avg       0.68      0.65      0.66      1829
 samples avg       0.72      0.70      0.70      1829



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# **Model Deployement**

In [17]:
!pip install gradio

Collecting gradio
  Downloading gradio-5.32.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<25.0,>=22.0 (from gradio)
  Downloading aiofiles-24.1.0-py3-none-any.whl.metadata (10 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.6.0-py3-none-any.whl.metadata (2.9 kB)
Collecting gradio-client==1.10.2 (from gradio)
  Downloading gradio_client-1.10.2-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloading ruff-0.11.12-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting safehttpx<0.2.0,>=0.1.6 (from gradio)
  Downloading safehttpx-0.1.6-py3-none-any.whl.metadata (4.2 kB)
Collecting semantic-version~=2.0 (from gradio)
  Downloading semantic_version-2.10.0-py2.py3-none-any.whl.metadata (9.7 kB)
Collecting tomlkit<0.14.0,>=0.

In [18]:
# app.py
import gradio as gr
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import pickle

# Load model, tokenizer, mlb
model = AutoModelForSequenceClassification.from_pretrained("hostility_model")
tokenizer = AutoTokenizer.from_pretrained("hostility_model")
with open("mlb.pkl", "rb") as f:
    mlb = pickle.load(f)

model.eval()

def predict(text):
    try:
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=256)
        with torch.no_grad():
            logits = model(**inputs).logits
            probs = torch.sigmoid(logits)
            preds = (probs > 0.5).int().numpy()
        labels = mlb.inverse_transform(preds)[0]
        return ", ".join(labels) if labels else "No hostility detected."
    except Exception as e:
        return f"Error: {str(e)}"

gr.Interface(fn=predict, inputs="text", outputs="text", title="Hostility Detection").launch()


It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://30d3472c378efa4b0f.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


