In [10]:
import pandas as pd
from openai import OpenAI
from dotenv import load_dotenv
import json

In [11]:
# Load settings from a local .env file before building the API client
# (presumably this is where the OpenAI API key comes from - TODO confirm).
load_dotenv()
# NOTE(review): `client` is not referenced by any other cell shown here;
# presumably the batch upload/download happens elsewhere - confirm.
client = OpenAI()

In [76]:
def generate_response(req_index, text):
    """
    Build one spam-classification request entry for an OpenAI batch file.

    The entry is a POST to /v1/chat/completions whose response is constrained
    by a strict JSON schema to a single "classification" field holding one of
    the six spam categories listed in the enum below.

    Args:
        req_index (int | str): Identifier of the request; stored as a string
            in "custom_id" so results can be matched back to their row.
        text (str): The text to classify.

    Returns:
        dict: A JSON-serialisable dictionary containing the request details.
    """
    # The category names in this prompt must match the schema enum below
    # exactly; the prompt previously misspelled "financial" as "finanical".
    system_prompt = (
        "Classify the following text as one of the following: "
        "financial/banking spam, phishing spam, job/employment spam, "
        "malware spam, prize/lottery spam, or general spam. "
        "If the text does not fit any of these categories, "
        "classify it as 'general spam'."
    )
    return {
        "custom_id": str(req_index),
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": "gpt-4o-mini",
            "messages": [
                {
                    "role": "system",
                    "content": system_prompt
                },
                {
                    "role": "user",
                    "content": text
                }
            ],
            "max_tokens": 4096,
            "response_format": {
                "type": "json_schema",
                "json_schema": {
                    "name": "spam_classification",
                    "schema": {
                        "type": "object",
                        "properties": {
                            "classification": {
                                "type": "string",
                                "enum": [
                                    "financial/banking spam",
                                    "phishing spam",
                                    "job/employment spam",
                                    "malware spam",
                                    "prize/lottery spam",
                                    "general spam"
                                ]
                            }
                        },
                        "required": ["classification"],
                        "additionalProperties": False
                    },
                    "strict": True
                },
            }
        }
    }


In [77]:
def create_batch_file(csv_file, output_file):
    """
    Create a batch file for OpenAI API requests from a CSV file.

    One JSON line is written per row whose "label_num" equals 1, built by
    generate_response() with the row's positional index as the request id.
    Rows with a missing "text" value are skipped: a NaN would be serialised
    as the bare token NaN, which is not valid JSON.

    Args:
        csv_file (str): Path to the input CSV file; must contain the columns
            "label_num" and "text".
        output_file (str): Path to the output batch file (JSON Lines).
    """
    raw_data = pd.read_csv(csv_file)
    # Positional 0..n-1 index. drop=True avoids inserting an "index" column,
    # which would raise if the CSV already has a column with that name.
    raw_data = raw_data.reset_index(drop=True)

    spam_text = raw_data.loc[raw_data["label_num"] == 1, "text"]

    # Build every line before opening the output so a failure while building
    # requests does not leave a truncated file behind.
    batch_lines = [
        json.dumps(generate_response(index, str(text)))
        for index, text in spam_text.items()
        if not pd.isna(text)
    ]

    with open(output_file, "w", encoding="utf-8") as f:
        f.writelines(line + "\n" for line in batch_lines)

In [88]:
def process_batch_file(batch_file):
    """
    Process the batch file and extract the classification results.

    Each non-blank line is parsed as JSON. The classification is read from
    the JSON document stored in the first choice's message content. Records
    that carry no usable classification (for example a null "response", a
    null message content, or content without a "classification" key) are
    skipped rather than aborting the whole file, so their ids are simply
    absent from the result. A line that is not valid JSON still raises.

    Args:
        batch_file (str): Path to the batch output file (JSON Lines).

    Returns:
        dict: A dictionary mapping request indices ("custom_id" strings) to
        their classifications.
    """
    request_class_mapping = {}
    with open(batch_file, "r", encoding="utf-8") as f:
        # Iterate lazily instead of loading the whole file with readlines().
        for line in f:
            if not line.strip():
                continue  # tolerate blank or trailing empty lines
            record = json.loads(line)
            try:
                req_index = record["custom_id"]
                content = record["response"]["body"]["choices"][0]["message"]["content"]
                classification = json.loads(content)["classification"]
            except (KeyError, IndexError, TypeError, ValueError):
                # No completion to read for this request; leave it unmapped.
                continue
            request_class_mapping[req_index] = classification
    return request_class_mapping

In [83]:
def apply_classifications(csv_file, request_class_mapping, output_file="classified_spam.csv"):
    """
    Apply the classifications to the original CSV file and save the results.

    A "classification" column is added: rows whose "label_num" equals 1 get
    the value stored under their positional index (as a string) in
    request_class_mapping, or "unknown" when no entry exists; every other
    row is labelled "ham".

    Args:
        csv_file (str): Path to the input CSV file; must contain "label_num".
        request_class_mapping (dict): A dictionary mapping request indices
            (as strings) to their classifications.
        output_file (str): Path of the CSV to write. Defaults to
            "classified_spam.csv", the location this function always used.
    """
    raw_data = pd.read_csv(csv_file)
    # reset_index() without drop=True also stores the row positions in an
    # "index" column; kept as-is so the written CSV keeps the same columns.
    raw_data = raw_data.reset_index()

    # Build the whole column in one pass instead of per-row .at writes.
    raw_data["classification"] = [
        request_class_mapping.get(str(index), "unknown") if label_num == 1 else "ham"
        for index, label_num in raw_data["label_num"].items()
    ]

    raw_data.to_csv(output_file, index=False)

In [78]:
# Build the batch request file for the small test CSV.
create_batch_file("test_data.csv", "test_batch_input.jsonl")

In [94]:
# Check that the parser turns the test batch output into an id -> label dict.
print(process_batch_file("test_batch_output.jsonl"))

{'1': 'general spam'}


In [None]:
# Label the test CSV from the parsed test batch output; the result is written
# to classified_spam.csv.
apply_classifications("test_data.csv", process_batch_file("test_batch_output.jsonl"))

Request Index: 1, Classification: general spam


In [79]:
# Build the batch request file for Raw Data/total_spam.csv.
create_batch_file("Raw Data/total_spam.csv", "04.05.2025_batch_input.jsonl")

In [91]:
# Label Raw Data/total_spam.csv from the downloaded batch output. This writes
# classified_spam.csv again, replacing the file produced by the test run.
apply_classifications("Raw Data/total_spam.csv", process_batch_file("batch_67f1a02ecdac819095c4763fab8486ef_output.jsonl"))

In [93]:
# Preview the first 40 rows of the labelled output.
pd.read_csv("classified_spam.csv").head(40)

Unnamed: 0,index,text,label,label_num,source,type,classification
0,0,Subject: enron methanol ; meter # : 988291\r\n...,ham,0,spam_mails,email,ham
1,1,"Subject: hpl nom for january 9 , 2001\r\n( see...",ham,0,spam_mails,email,ham
2,2,"Subject: neon retreat\r\nho ho ho , we ' re ar...",ham,0,spam_mails,email,ham
3,3,"Subject: photoshop , windows , office . cheap ...",spam,1,spam_mails,email,general spam
4,4,Subject: re : indian springs\r\nthis deal is t...,ham,0,spam_mails,email,ham
5,5,Subject: ehronline web address change\r\nthis ...,ham,0,spam_mails,email,ham
6,6,Subject: spring savings certificate - take 30 ...,ham,0,spam_mails,email,ham
7,7,Subject: looking for medication ? we ` re the ...,spam,1,spam_mails,email,general spam
8,8,Subject: noms / actual flow for 2 / 26\r\nwe a...,ham,0,spam_mails,email,ham
9,9,"Subject: nominations for oct . 21 - 23 , 2000\...",ham,0,spam_mails,email,ham


In [89]:
# Print the complete id -> classification mapping parsed from the batch output.
print(process_batch_file("batch_67f1a02ecdac819095c4763fab8486ef_output.jsonl"))

{'3': 'general spam', '7': 'general spam', '10': 'financial/banking spam', '11': 'general spam', '13': 'general spam', '15': 'financial/banking spam', '24': 'financial/banking spam', '35': 'general spam', '40': 'general spam', '41': 'phishing spam', '46': 'general spam', '47': 'phishing spam', '48': 'general spam', '50': 'prize/lottery spam', '60': 'financial/banking spam', '68': 'financial/banking spam', '70': 'general spam', '71': 'prize/lottery spam', '74': 'general spam', '75': 'general spam', '85': 'general spam', '90': 'malware spam', '91': 'financial/banking spam', '92': 'general spam', '93': 'general spam', '94': 'general spam', '98': 'general spam', '103': 'general spam', '104': 'general spam', '106': 'general spam', '109': 'general spam', '110': 'general spam', '115': 'general spam', '124': 'general spam', '126': 'general spam', '128': 'general spam', '129': 'general spam', '134': 'general spam', '135': 'phishing spam', '137': 'financial/banking spam', '138': 'general spam', 