# **CTI Relevance Classifier with new data**

## Connect to Google Drive

In [None]:
# Colab-only step: mount Google Drive so that the /content/drive/MyDrive/...
# paths used later in this notebook resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Install Requirements

In [None]:
!pip install transformers torch



In [None]:
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification

## Load tokenizer & model

In [None]:
# Locations on the mounted Google Drive: the saved model directory, the input
# CSV to classify, and the output CSV that will receive the relevant posts.
MODEL_PATH = "/content/drive/MyDrive/distilbert_cti_model"
CSV_PATH = "/content/drive/MyDrive/reddit_new_ml_ready.csv"
OUTPUT_RELEVANT = "/content/drive/MyDrive/relevant_posts.csv"

# Tokenizer and classifier are both loaded from the same saved directory.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH)

# Inference only: switch off training-time behaviour such as dropout.
model.eval()

# Use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Assign the result instead of ending the cell on a bare `model.to(device)`:
# a bare expression makes the notebook echo the entire module repr.
model = model.to(device)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


## Load CSV, run batch prediction, and save relevant posts

In [None]:
# Read the posts to classify; the model input is taken from 'clean_text'.
df = pd.read_csv(CSV_PATH)

if "clean_text" not in df.columns:
    raise ValueError("CSV file must have a 'clean_text' column.")

# pandas reads empty cells as float NaN, but the tokenizer only accepts
# strings. Coerce the whole column once so a single missing post cannot abort
# the run, while keeping one text per row so predictions stay aligned with df.
texts = df["clean_text"].fillna("").astype(str).tolist()

batch_size = 16
predictions = []

for i in range(0, len(texts), batch_size):
    batch_texts = texts[i:i + batch_size]
    # Pad to the longest text in the batch and truncate over-long texts.
    encodings = tokenizer(batch_texts, return_tensors="pt", truncation=True, padding=True)

    input_ids = encodings["input_ids"].to(device)
    attention_mask = encodings["attention_mask"].to(device)

    # No gradients are needed for inference.
    with torch.no_grad():
        logits = model(input_ids=input_ids, attention_mask=attention_mask).logits

    # Highest-scoring class per row, collected as plain Python ints.
    predictions.extend(torch.argmax(logits, dim=1).cpu().tolist())

# Attach the predicted class to every row, then keep only rows predicted as
# class 1, which this notebook saves as the "relevant" posts.
df["label"] = predictions
df_relevant = df[df["label"] == 1].reset_index(drop=True)
df_relevant.to_csv(OUTPUT_RELEVANT, index=False)

print(f"✅ Relevant posts saved to: {OUTPUT_RELEVANT}")

✅ Relevant posts saved to: /content/drive/MyDrive/relevant_posts.csv
