# Import dataset

In [11]:
import pandas as pd
import numpy as np

In [12]:
# Load the labelled incident descriptions and print a column / dtype / null-count summary.
df = pd.read_csv(filepath_or_buffer="security_incidents_dataset.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110 entries, 0 to 109
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   incident_id  100 non-null    float64
 1   description  110 non-null    object 
 2   priority     110 non-null    object 
dtypes: float64(1), object(2)
memory usage: 2.7+ KB


### Because the dataset is small, the models are likely to overfit. To reduce this risk, I am choosing models with lower complexity.

- I took four classification models:
  - LogisticRegression
  - SVC
  - RandomForestClassifier
  - GaussianNB


- Selecting the model with the best score

In [14]:
# Import models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

# Import modules for model evaluation and embeddings
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sentence_transformers import SentenceTransformer
from sklearn.metrics import classification_report

In [18]:

# Encode the description
# One seed shared by the train/test split, the CV folds and every stochastic
# estimator, so that re-running the cell reproduces the same scores and the
# same "best model" choice.
SEED = 42

X = df["description"].tolist()
y = df["priority"]

embedder = SentenceTransformer("all-MiniLM-L6-v2")
X_emb = embedder.encode(X)

# Train test split (stratified so both classes keep their proportions)
X_train, X_test, y_train, y_test = train_test_split(
    X_emb, y, test_size=0.2, stratify=y, random_state=SEED
)


# Defining the models
# random_state is set on the estimators that use randomness internally
# (Random Forest bootstrapping/feature sampling, SVC probability calibration);
# without it the CV scores below can differ from run to run.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "SVM": SVC(probability=True, random_state=SEED),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=SEED),
    "Gaussian NB": GaussianNB(),
}

# The splitter does not depend on the model, so build it once and reuse it:
# every candidate is then scored on exactly the same folds.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

# Start below any achievable F1 so that a model scoring exactly 0 can still be
# selected instead of leaving best_model as None.
best_score = float("-inf")
best_model_name = None
best_model = None

for name, model in models.items():
    scores = cross_val_score(model, X_train, y_train, cv=cv, scoring="f1_macro")
    mean_score = scores.mean()
    print(f"{name} CV F1 scores: {scores}, Average: {mean_score:.4f}")

    if mean_score > best_score:
        best_score = mean_score
        best_model_name = name
        best_model = model

# A NaN mean (e.g. a fold that failed to score) never compares greater than the
# current best, so it is still possible to end up with no winner; fail with a
# clear message rather than an AttributeError on None below.
if best_model is None:
    raise RuntimeError("No model produced a valid CV score; cannot select a best model.")

print(f"\nBest model: {best_model_name} with CV F1: {best_score:.4f}")

# Train the best model with full training set
# (cross_val_score works on clones, so the selected estimator is still unfitted here)
best_model.fit(X_train, y_train)
y_pred_test = best_model.predict(X_test)

print("\nTest Set Evaluation:")
print(classification_report(y_test, y_pred_test))


Logistic Regression CV F1 scores: [1.         1.         1.         1.         0.94035088], Average: 0.9881
SVM CV F1 scores: [1.         1.         1.         1.         0.94035088], Average: 0.9881
Random Forest CV F1 scores: [1.         0.8875     1.         0.94035088 0.75714286], Average: 0.9170
Gaussian NB CV F1 scores: [1. 1. 1. 1. 1.], Average: 1.0000

Best model: Gaussian NB with CV F1: 1.0000

Test Set Evaluation:
              precision    recall  f1-score   support

        High       1.00      1.00      1.00        11
         Low       1.00      1.00      1.00        11

    accuracy                           1.00        22
   macro avg       1.00      1.00      1.00        22
weighted avg       1.00      1.00      1.00        22



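### The perfect scores are suspicious. Because the dataset was generated using ChatGPT, the models are probably overfitting to its narrow, synthetic phrasing rather than learning patterns that generalize. To overcome this problem, we should increase the dataset size. For now, this is the best approach.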

In [19]:
# Sanity check: classify two hand-written incidents that are not in the dataset.
new_samples = [
    "Unknown person loitering near the front gate",  # expected label: High
    "Delivery person left package at the doorstep",  # expected label: Low
]

# Same two-step path used for training: embed the text, then classify the vectors.
new_emb = embedder.encode(new_samples)
preds = best_model.predict(new_emb)

print("Predictions for new incidents:", preds)

Predictions for new incidents: ['High' 'Low']


## Let's create a pipeline to streamline preprocessing, feature extraction, and model training.

In [20]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sentence_transformers import SentenceTransformer

In [21]:
def embed_text(texts):
    """Convert raw text into sentence embeddings using the notebook-level ``embedder``.

    Parameters
    ----------
    texts : str or iterable of str
        One description or a collection of descriptions. A bare string is
        treated as a single sample; any other iterable (list, tuple,
        generator, array-like) is materialised into a plain list first.

    Returns
    -------
    The result of ``embedder.encode`` called on a list of strings, i.e. one
    embedding per input text.
    """
    if isinstance(texts, str):
        # Wrap a lone string so the encoder sees a batch of one sample rather
        # than a scalar input.
        texts = [texts]
    else:
        # Hand the encoder a plain list regardless of the caller's container.
        # NOTE(review): containers such as a pandas Series with a non-default
        # index may not be safely indexable by position inside the encoder.
        texts = list(texts)
    return embedder.encode(texts)

# Wrap embed_text as a pipeline step; validate=False because the input is raw
# text, not a numeric array.
embedding_transformer = FunctionTransformer(func=embed_text, validate=False)


In [22]:
# Chain the text-embedding step with the classifier selected and fitted earlier,
# so raw descriptions can be passed straight to predict().
pipeline = Pipeline(
    steps=[
        ("embeddings", embedding_transformer),
        ("classifier", best_model),
    ]
)


In [23]:
# Feed the raw sample strings through the whole pipeline (embedding + classifier);
# left as the cell's last expression so the predicted labels are displayed.
pipeline.predict(new_samples)

array(['High', 'Low'], dtype='<U4')

## Let's save the pipeline

### Let's call our model the "Security Description Classifier" (SDC).

In [25]:
import joblib

# Persist the pipeline to disk; the file name is defined once so the dump call
# and the confirmation message cannot drift apart.
# NOTE(review): the pipeline wraps the notebook-level function `embed_text`, which
# reads the global `embedder`. Pickle stores functions by name, so loading this
# file elsewhere presumably requires both to be defined first - confirm before
# deploying.
pipeline_path = "SDC_pipeline.pkl"
joblib.dump(pipeline, pipeline_path)
print(f"Pipeline saved as '{pipeline_path}'")

Pipeline saved as 'SDC_pipeline.pkl'
