In [20]:
import pandas as pd
from openai import OpenAI
from dotenv import load_dotenv
import json

In [21]:
# Load variables from a .env file before constructing the client; presumably
# this is where the OpenAI API key comes from — confirm against the .env file.
load_dotenv()
# NOTE(review): `client` is not referenced by any cell visible in this notebook;
# the helpers below only read and write batch JSONL/CSV files locally.
client = OpenAI()

In [22]:
# Single source of truth for the allowed labels: both the system prompt and the
# JSON-schema enum are derived from it so the two can never drift apart.
# The last entry doubles as the fallback label named in the prompt.
_SPAM_CATEGORIES = (
    "financial/banking spam",
    "phishing spam",
    "job/employment spam",
    "malware spam",
    "prize/lottery spam",
    "sales-based spam",
    "general spam",
)


def generate_response(req_index, text):
    """
    Build one batch request line asking the model to classify a spam message.

    Args:
        req_index (int | str): Identifier of the request; stored as the
            string ``custom_id`` so results can be matched back to the row.
        text (str): The message text to classify.

    Returns:
        dict: A JSON-serialisable request with ``custom_id``, ``method``,
        ``url`` and ``body`` keys. The body constrains the reply to a JSON
        object with a single ``classification`` field whose value is one of
        ``_SPAM_CATEGORIES``.
    """
    # "a, b, ..., or z" — every label in the prompt is spelled exactly as in the enum.
    category_list = ", ".join(_SPAM_CATEGORIES[:-1]) + ", or " + _SPAM_CATEGORIES[-1]
    system_prompt = (
        f"Classify the following text as one of the following: {category_list}. "
        "If the text does not fit any of these categories, "
        f"classify it as '{_SPAM_CATEGORIES[-1]}'."
    )

    return {
        "custom_id": str(req_index),
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": "gpt-4o-mini",
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": text},
            ],
            "max_tokens": 4096,
            "response_format": {
                "type": "json_schema",
                "json_schema": {
                    "name": "spam_classification",
                    "schema": {
                        "type": "object",
                        "properties": {
                            "classification": {
                                "type": "string",
                                # Fresh list per request so callers cannot
                                # mutate the shared tuple's contents.
                                "enum": list(_SPAM_CATEGORIES),
                            }
                        },
                        "required": ["classification"],
                        "additionalProperties": False,
                    },
                    "strict": True,
                },
            },
        },
    }


In [23]:
def create_batch_file(csv_file, output_file):
    """
    Create a JSONL batch file of classification requests from a CSV file.

    One request is written for every row whose ``label_num`` equals 1 and
    whose ``text`` is present. Each request's ``custom_id`` is the row's
    zero-based position in the CSV.

    Args:
        csv_file (str): Path to the input CSV file; must contain the columns
            ``label_num`` and ``text``.
        output_file (str): Path of the JSONL batch file to write (overwritten
            if it already exists).
    """
    raw_data = pd.read_csv(csv_file)
    # Guarantee a 0..n-1 positional index without adding an extra column.
    raw_data = raw_data.reset_index(drop=True)

    # Rows with missing text are skipped: json.dumps would emit the bare token
    # NaN for them, which is not valid JSON and not a string message content.
    wanted = (raw_data["label_num"] == 1) & raw_data["text"].notna()

    # Build every line before opening the output so a failure while building
    # requests does not truncate an existing batch file.
    batch_lines = [
        json.dumps(generate_response(index, str(text))) + "\n"
        for index, text in raw_data.loc[wanted, "text"].items()
    ]

    with open(output_file, "w", encoding="utf-8") as f:
        f.writelines(batch_lines)

In [24]:
def process_batch_file(batch_file):
    """
    Process a batch output file and extract the classification results.

    Blank lines are ignored. Entries that carry no usable classification
    (for example a null ``response``, a missing ``choices`` list, or message
    content that is null or not the expected JSON object) are skipped and
    reported through a single ``UserWarning``; their ids are absent from the
    returned mapping.

    Args:
        batch_file (str): Path to the batch output file (one JSON object per line).

    Returns:
        dict: A dictionary mapping each request's ``custom_id`` (str) to its
        classification (str).

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a line has no ``custom_id``.
    """
    import warnings

    request_class_mapping = {}
    skipped_ids = []

    with open(batch_file, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank / trailing empty lines

            response = json.loads(line)
            req_index = response["custom_id"]
            try:
                content = response["response"]["body"]["choices"][0]["message"]["content"]
                classification = json.loads(content)["classification"]
            except (KeyError, IndexError, TypeError, ValueError):
                # TypeError: a null response/content; ValueError: content is not JSON.
                skipped_ids.append(req_index)
                continue
            request_class_mapping[req_index] = classification

    if skipped_ids:
        warnings.warn(
            f"{len(skipped_ids)} batch response(s) had no usable classification "
            f"and were skipped (first ids: {skipped_ids[:10]})"
        )
    return request_class_mapping

In [28]:
def apply_classifications(csv_file, request_class_mapping, output_file="classified_spam+sales.csv"):
    """
    Apply the classifications to the original CSV file and save the results.

    Adds a ``classification`` column: rows with ``label_num == 1`` get the
    label stored under their row number (as a string) in
    ``request_class_mapping``, or ``"unknown"`` when no entry exists; every
    other row gets ``"ham"``.

    Args:
        csv_file (str): Path to the input CSV file; must contain ``label_num``.
        request_class_mapping (dict): A dictionary mapping request indices
            (str) to their classifications.
        output_file (str): Path of the CSV to write. Defaults to
            ``"classified_spam+sales.csv"``, the previously hard-coded name.
    """
    raw_data = pd.read_csv(csv_file)
    # reset_index() without drop=True also stores the row number as an "index"
    # column in the saved CSV; kept so the output schema does not change.
    raw_data = raw_data.reset_index()

    is_spam = raw_data["label_num"] == 1
    # Look up every row's label once, then overwrite the non-spam rows with
    # "ham" in a single column operation instead of a per-row .at loop.
    looked_up = pd.Series(
        [request_class_mapping.get(str(row_number), "unknown") for row_number in raw_data.index],
        index=raw_data.index,
        dtype=object,
    )
    raw_data["classification"] = looked_up.where(is_spam, "ham")

    raw_data.to_csv(output_file, index=False)

In [8]:
# Build the batch request file (JSONL) from test_data.csv.
create_batch_file("test_data.csv", "test_batch_input.jsonl")

In [9]:
# Parse the test batch output and show the custom_id -> classification mapping.
print(process_batch_file("test_batch_output.jsonl"))

{'1': 'general spam'}


In [10]:
# Label test_data.csv with the parsed test results.
# NOTE(review): this call writes to classified_spam+sales.csv, the same file
# the later apply_classifications calls write to, so this result is overwritten.
apply_classifications("test_data.csv", process_batch_file("test_batch_output.jsonl"))

In [11]:
# Build the batch request file for Raw Data/total_spam.csv.
create_batch_file("Raw Data/total_spam.csv", "04.05.2025_batch_input.jsonl")

In [12]:
# Label Raw Data/total_spam.csv using a downloaded batch output file;
# presumably the result of submitting 04.05.2025_batch_input.jsonl — confirm.
apply_classifications("Raw Data/total_spam.csv", process_batch_file("batch_67f1a02ecdac819095c4763fab8486ef_output.jsonl"))

In [18]:
# Count rows per classification label.
# NOTE(review): classified_spam.csv is not written by any cell shown in this
# notebook (apply_classifications writes classified_spam+sales.csv); presumably
# it came from an earlier version of that function — confirm before re-running.
pd.read_csv("classified_spam.csv")["classification"].value_counts()

classification
ham                       15753
general spam               2199
prize/lottery spam          891
financial/banking spam      259
phishing spam               191
malware spam                157
job/employment spam         157
Name: count, dtype: int64

In [30]:
# Count rows per classification label in the file written by apply_classifications.
pd.read_csv("classified_spam+sales.csv")["classification"].value_counts()

classification
ham                       15753
sales-based spam           1310
general spam               1148
prize/lottery spam          839
phishing spam               192
financial/banking spam      172
job/employment spam         110
malware spam                 83
Name: count, dtype: int64

In [27]:
# Inspect the parsed custom_id -> classification mapping for this batch output.
# NOTE(review): this prints the whole mapping; the captured output is very long.
print(process_batch_file("batch_680a6b0e022481909d9032cc2d1633f5_output.jsonl"))

{'3': 'general spam', '7': 'sales-based spam', '10': 'sales-based spam', '11': 'general spam', '13': 'general spam', '15': 'sales-based spam', '24': 'sales-based spam', '35': 'sales-based spam', '40': 'general spam', '41': 'phishing spam', '46': 'sales-based spam', '47': 'phishing spam', '48': 'sales-based spam', '50': 'prize/lottery spam', '60': 'sales-based spam', '68': 'financial/banking spam', '70': 'sales-based spam', '71': 'prize/lottery spam', '74': 'sales-based spam', '75': 'general spam', '85': 'sales-based spam', '90': 'general spam', '91': 'sales-based spam', '92': 'sales-based spam', '93': 'general spam', '94': 'general spam', '98': 'sales-based spam', '103': 'sales-based spam', '104': 'general spam', '106': 'sales-based spam', '109': 'general spam', '110': 'sales-based spam', '115': 'general spam', '124': 'general spam', '126': 'sales-based spam', '128': 'sales-based spam', '129': 'general spam', '134': 'sales-based spam', '135': 'phishing spam', '137': 'financial/banking 

In [26]:
# Rebuild the batch request file for Raw Data/total_spam.csv under a new name.
# NOTE(review): execution count 26 — this ran before the In [27] cell shown above it.
create_batch_file("Raw Data/total_spam.csv", "04.24.2025_batch_input.jsonl")

In [29]:
# Label Raw Data/total_spam.csv using this batch output; this overwrites
# classified_spam+sales.csv written by any earlier apply_classifications call.
apply_classifications("Raw Data/total_spam.csv", process_batch_file("batch_680a6b0e022481909d9032cc2d1633f5_output.jsonl"))