# Python code to train an ML model for article type classification
The input dataset should include the following columns (names are case-sensitive and must match the values configured in the set-up cell below):
- 'Type': The classification label for each article.
- 'body': The web scraped body content of the article.
- 'Title': The article title. Used as a fallback when the body content is empty.

# Set up

In [None]:
# All imports used
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Choose the ML model type you prefer
from sklearn.svm import LinearSVC

from sklearn.model_selection import cross_validate

from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
)

In [None]:
# Load the dataset; replace 'dataset.csv' with the actual path to your CSV file
data = pd.read_csv(filepath_or_buffer='dataset.csv')

# Preview the first five rows to confirm the import worked
data.head(n=5)

In [None]:
# Update column names to match the dataset format.
# The three input names must match the CSV header exactly (pandas column
# lookups are case-sensitive).
type_col_name = 'Type'   # Column for the initial type classification
body_col_name = 'body'   # Column for the web scraped article body content
title_col_name = 'Title' # Column for the article title (used as fallback if body is empty)

# This column is written by the regrouping step of this notebook; an existing
# column with the same name would be overwritten.
class_col_name = 'class' # Column for the 'fixed type' (regrouped types)

# Improve data

## Regroup types to reduce the number of categories
This is done to prevent an excessive number of type classes, which could negatively impact model performance.

In [None]:
# Update the desired data types and corresponding data names based on your dataset
def fix_type(type):
  """Map a raw article type label onto one of the regrouped classes.

  The label is converted to a string, lower-cased and stripped, then checked
  against an ordered list of keyword rules; the first rule that matches wins.

  Args:
    type: Raw label from the dataset. Any value is accepted because it is
      passed through ``str()`` first (so NaN/None fall through to "Other").
      The parameter name shadows the builtin but is kept so existing callers
      keep working.

  Returns:
    One of "Malware", "Informational", "Ransomware", "Phishing", "Data leak",
    "AI", "Vulnerability" or "Other".
  """
  import re  # local import keeps this cell runnable on its own

  label = str(type).lower().strip()

  # Ordered (keywords, class) rules checked before the AI rule.
  leading_rules = (
    (("malware", "spyware"), "Malware"),
    (("information",), "Informational"),
    (("ransomware",), "Ransomware"),
    (("phishing",), "Phishing"),
    (("data leak", "data breach", "data stealing"), "Data leak"),
  )
  for keywords, class_name in leading_rules:
    if any(keyword in label for keyword in keywords):
      return class_name

  # "ai" must be a whole word (optionally "genai"): a plain substring test
  # also fires on unrelated labels such as "email" or "supply chain".
  if re.search(r"\b(?:gen)?ai\b", label):
    return "AI"

  # "vulerability" is a misspelling that is matched on purpose.
  if any(keyword in label
         for keyword in ("vulnerability", "vulerability", "vulnerabilities")):
    return "Vulnerability"

  return "Other"

In [None]:
# Regroup every raw type label, then store the result as the class column
regrouped_types = data[type_col_name].apply(fix_type)
data[class_col_name] = regrouped_types

In [None]:
# Preview the first five rows to check the new fixed types column
data.head(n=5)

## Handle empty body content
Replace empty body entries with the Title value to provide the ML model with usable data for training.

In [None]:
# Replace empty body content with the article title
def _is_empty_body(body):
  """Return True when a scraped body holds no usable text.

  A body counts as empty when it is missing (NaN/None, which is how pandas
  reads empty CSV fields by default), blank after stripping whitespace, or
  one of the literal quote placeholders '" "' / '""'.
  """
  if pd.isna(body):
    return True
  return str(body).strip() in ('', '" "', '""')

# One pass over the body column; astype(bool) keeps the mask boolean even
# when the DataFrame has no rows.
empty_body_rows = data[body_col_name].apply(_is_empty_body).astype(bool)
data[body_col_name] = data[body_col_name].mask(empty_body_rows, data[title_col_name])

In [None]:
# Preview the first five rows to check the updated body column
data.head(n=5)

# Set up classification

## Set up pipelines

In [None]:
# Build the pipeline stages: token counts -> TF-IDF weighting -> classifier
count_vectorizer = CountVectorizer()
tfidf_transformer = TfidfTransformer()
classifier = LinearSVC()  # replace with your chosen model

text_clf = Pipeline(steps=[
  ('vect', count_vectorizer),
  ('tfidf', tfidf_transformer),
  ('clf', classifier),
])

In [None]:
# Drop rows that are missing the label or the text used for training.
# Restricting the check to these columns keeps rows whose only gaps are in
# columns the model never reads.
data = data.dropna(subset=[type_col_name, body_col_name, class_col_name])

## Train / test / evaluate **(optional)**

In [None]:
# Number of cross-validation folds; handed to cross_validate as its `cv` argument
fold_num = 4

In [None]:
# Cross-validate a model and collect accuracy / precision / recall / F1 scores
def cross_val (model, x_set, y_set, fold_num) :
  """Run k-fold cross validation and return the raw scoring results.

  Args:
    model: Estimator (or pipeline) to evaluate.
    x_set: Input samples.
    y_set: Target labels for x_set.
    fold_num: Value passed to cross_validate as ``cv``.

  Returns:
    Whatever cross_validate returns for the requested metrics (training
    scores are not requested).
  """
  # accuracy, then precision / recall / f1 in weighted, micro and macro form
  averaged_metrics = [
    f"{metric}_{average}"
    for metric in ('precision', 'recall', 'f1')
    for average in ('weighted', 'micro', 'macro')
  ]
  metric_names = ['accuracy'] + averaged_metrics

  return cross_validate(
    estimator=model,
    X=x_set,
    y=y_set,
    cv=fold_num,
    scoring=metric_names,
    return_train_score=False,
  )

In [None]:
def print_results(results):
  """Print the mean of the accuracy and weighted precision/recall/F1 scores.

  Args:
    results: Mapping with 'test_accuracy', 'test_precision_weighted',
      'test_recall_weighted' and 'test_f1_weighted' entries, each exposing
      a ``.mean()`` method.
  """
  report_lines = (
    ("Mean Validation Accuracy", 'test_accuracy'),
    ("Mean Validation Precision (weighted)", 'test_precision_weighted'),
    ("Mean Validation Recall (weighted)", 'test_recall_weighted'),
    ("Mean Validation F1 Score (weighted)", 'test_f1_weighted'),
  )
  for heading, result_key in report_lines:
    print(heading, results[result_key].mean())

In [None]:
# Cross-validate the pipeline on the article bodies and regrouped classes,
# then print the averaged scores
results = cross_val(
  model=text_clf,
  x_set=data[body_col_name],
  y_set=data[class_col_name],
  fold_num=fold_num,
)
print_results(results)

# Train and save model

In [None]:
# Fit the pipeline on all article bodies with their regrouped classes
text_clf.fit(X=data[body_col_name], y=data[class_col_name])

In [None]:
# Save model as pickle file
import pickle
pickle_file = 'type_ml_model.sav'

# The context manager flushes and closes the file even if pickling fails.
# NOTE: only unpickle this file in environments that trust its origin.
with open(pickle_file, 'wb') as model_file:
  pickle.dump(text_clf, model_file)

In [None]:
# Save the model to a JSON file for loading into MongoDB
import json
import pickle  # imported here so this cell does not depend on another cell
from io import BytesIO  # no longer used in this cell; kept in case other cells rely on it
import base64
json_file = 'json_ml_data.json'

# JSON cannot hold raw bytes, so pickle the model to bytes and encode the
# result as base64 text
model_base64 = base64.b64encode(pickle.dumps(text_clf)).decode('utf-8')

# Create the needed JSON structure
json_ml_data = {
    'model': model_base64,
    'name': "ML model" # Retain this name to ensure consistency with the database script for loading
}

# Both values are plain strings, so no `default=` fallback is passed: an
# unexpected non-serializable value should raise instead of being silently
# stringified
with open(json_file, 'w', encoding='utf-8') as f:
  json.dump(json_ml_data, f)