In [1]:
from sklearn.model_selection import cross_val_score, cross_validate, StratifiedKFold
from sklearn.metrics import accuracy_score, balanced_accuracy_score, recall_score, precision_score, confusion_matrix, make_scorer
from sklearn import svm
import numpy as np
import pandas as pd

# Read the HTRU_2 CSV (parsed with header=None) and name its nine columns
df = pd.read_csv('./data/HTRU_2.csv', header=None)
df.columns = ['IpMean', 'IpDev', 'IpKurt','IpSkew', 'DMMean', 'DMDev', 'DMKurt', 'DMSkew', 'Class']

# 'Class' (the last column) is the target; every other column is a feature
X = df.drop(columns='Class')
y = df['Class']

# Fixed positional split: rows 0-199 are used for training,
# rows 200-599 are held out for testing
X_train, y_train = X.iloc[:200], y.iloc[:200]
X_test, y_test = X.iloc[200:600], y.iloc[200:600]

# Linear-kernel support vector classifier (tune kernel / C as needed)
model = svm.SVC(C=1.0, kernel='linear')

# 10-fold stratified cross-validation; no shuffling or seed is requested here
cv = StratifiedKFold(n_splits=10)

# Define custom scorers
def specificity_score(y_true, y_pred):
    """Return the specificity (true-negative rate), TN / (TN + FP).

    Intended as a ``make_scorer`` target for a binary problem whose negative
    label sorts before the positive one (0 / 1 in this notebook).

    Parameters
    ----------
    y_true : array-like
        Ground-truth class labels.
    y_pred : array-like
        Predicted class labels.

    Returns
    -------
    float
        TN / (TN + FP). When ``y_true`` holds no negative samples the ratio
        is undefined; 0.0 is returned and a ``RuntimeWarning`` is emitted.

    Raises
    ------
    ValueError
        If more than two distinct labels are present, because the confusion
        matrix then cannot be unpacked into four cells.
    """
    import warnings

    cm = confusion_matrix(y_true, y_pred)
    if cm.shape == (1, 1):
        # y_true and y_pred contain one and the same label, so sklearn
        # collapsed the matrix to 1x1 and the unpacking below would fail.
        # Rebuild it with the 0/1 labels pinned so all four cells exist.
        cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()

    negatives = tn + fp
    if negatives == 0:
        # 0/0 would produce NaN and turn the cross-validation mean into NaN;
        # report 0.0 instead, as sklearn's recall/precision do by default
        # for ill-defined ratios.
        warnings.warn(
            "Specificity is ill-defined (no negative samples in y_true); returning 0.0",
            RuntimeWarning,
            stacklevel=2,
        )
        return 0.0
    return tn / negatives

def negative_prediction_value_score(y_true, y_pred):
    """Return the negative predictive value, TN / (TN + FN).

    Intended as a ``make_scorer`` target for a binary problem whose negative
    label sorts before the positive one (0 / 1 in this notebook).

    Parameters
    ----------
    y_true : array-like
        Ground-truth class labels.
    y_pred : array-like
        Predicted class labels.

    Returns
    -------
    float
        TN / (TN + FN). When nothing is predicted negative the ratio is
        undefined; 0.0 is returned and a ``RuntimeWarning`` is emitted.

    Raises
    ------
    ValueError
        If more than two distinct labels are present, because the confusion
        matrix then cannot be unpacked into four cells.
    """
    import warnings

    cm = confusion_matrix(y_true, y_pred)
    if cm.shape == (1, 1):
        # y_true and y_pred contain one and the same label, so sklearn
        # collapsed the matrix to 1x1 and the unpacking below would fail.
        # Rebuild it with the 0/1 labels pinned so all four cells exist.
        cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()

    predicted_negatives = tn + fn
    if predicted_negatives == 0:
        # 0/0 would produce NaN and turn the cross-validation mean into NaN;
        # report 0.0 instead, as sklearn's precision does by default when
        # there are no predictions of the scored class.
        warnings.warn(
            "NPV is ill-defined (no negative predictions); returning 0.0",
            RuntimeWarning,
            stacklevel=2,
        )
        return 0.0
    return tn / predicted_negatives

def gmean_score(y_true, y_pred):
    """Return the G-mean: the square root of recall times specificity.

    Parameters
    ----------
    y_true : array-like
        Ground-truth class labels.
    y_pred : array-like
        Predicted class labels.

    Returns
    -------
    float
        ``sqrt(recall * specificity)`` for the given predictions.
    """
    sensitivity = recall_score(y_true, y_pred)
    true_negative_rate = specificity_score(y_true, y_pred)
    product = sensitivity * true_negative_rate
    return np.sqrt(product)

def informedness_score(y_true, y_pred):
    """Return the informedness, computed as recall + specificity - 1.

    Parameters
    ----------
    y_true : array-like
        Ground-truth class labels.
    y_pred : array-like
        Predicted class labels.

    Returns
    -------
    float
        ``recall + specificity - 1`` for the given predictions.
    """
    sensitivity = recall_score(y_true, y_pred)
    true_negative_rate = specificity_score(y_true, y_pred)
    return sensitivity + true_negative_rate - 1

# Wrap every metric function in a scorer, keyed by the name used in the report
scorers = {
    name: make_scorer(metric_func)
    for name, metric_func in [
        ('accuracy', accuracy_score),
        ('balanced_accuracy', balanced_accuracy_score),
        ('recall', recall_score),
        ('specificity', specificity_score),
        ('precision', precision_score),
        ('npv', negative_prediction_value_score),
        ('gmean', gmean_score),
        ('informedness', informedness_score),
    ]
}

# Cross-validate on the full dataset; only the test-fold scores are collected
scores = cross_validate(
    model,
    X,
    y,
    scoring=scorers,
    cv=cv,
    return_train_score=False,
)

# Summarise each metric across the folds as mean and standard deviation
results = {
    metric: {
        "mean": np.mean(scores['test_'+metric]),
        "std": np.std(scores['test_'+metric]),
    }
    for metric in scorers
}

# Print one "Metric: mean ± std" line per metric
for metric, value in results.items():
    print(f"{metric.capitalize()}: {value['mean']:.3f} ± {value['std']:.3f}")


Accuracy: 0.979 ± 0.004
Balanced_accuracy: 0.907 ± 0.022
Recall: 0.820 ± 0.044
Specificity: 0.995 ± 0.002
Precision: 0.944 ± 0.025
Npv: 0.982 ± 0.004
Gmean: 0.903 ± 0.024
Informedness: 0.815 ± 0.043


In [4]:
# Display the feature matrix X (every column of df except the last one);
# the notebook renders a truncated preview of the frame.
X

Unnamed: 0,IpMean,IpDev,IpKurt,IpSkew,DMMean,DMDev,DMKurt,DMSkew
0,140.562500,55.683782,-0.234571,-0.699648,3.199833,19.110426,7.975532,74.242225
1,102.507812,58.882430,0.465318,-0.515088,1.677258,14.860146,10.576487,127.393580
2,103.015625,39.341649,0.323328,1.051164,3.121237,21.744669,7.735822,63.171909
3,136.750000,57.178449,-0.068415,-0.636238,3.642977,20.959280,6.896499,53.593661
4,88.726562,40.672225,0.600866,1.123492,1.178930,11.468720,14.269573,252.567306
...,...,...,...,...,...,...,...,...
17893,136.429688,59.847421,-0.187846,-0.738123,1.296823,12.166062,15.450260,285.931022
17894,122.554688,49.485605,0.127978,0.323061,16.409699,44.626893,2.945244,8.297092
17895,119.335938,59.935939,0.159363,-0.743025,21.430602,58.872000,2.499517,4.595173
17896,114.507812,53.902400,0.201161,-0.024789,1.946488,13.381731,10.007967,134.238910


In [16]:
from sklearn.model_selection import cross_val_score, cross_validate, StratifiedKFold
from sklearn.metrics import accuracy_score, balanced_accuracy_score, recall_score, precision_score, confusion_matrix, make_scorer
from sklearn import svm
import numpy as np
import pandas as pd

# Read the HTRU_2 CSV (parsed with header=None) and name its nine columns
df = pd.read_csv('./data/HTRU_2.csv', header=None)
df.columns = ['IpMean', 'IpDev', 'IpKurt','IpSkew', 'DMMean', 'DMDev', 'DMKurt', 'DMSkew', 'Class']

# 'Class' (the last column) is the target; every other column is a feature
X = df.drop(columns='Class')
y = df['Class']

# Fixed positional split: rows 0-199 are used for training,
# rows 200-599 are held out for testing
X_train, y_train = X.iloc[:200], y.iloc[:200]
X_test, y_test = X.iloc[200:600], y.iloc[200:600]

# Linear-kernel support vector classifier (tune kernel / C as needed)
model = svm.SVC(C=1.0, kernel='linear')

# 10-fold stratified cross-validation; no shuffling or seed is requested here
cv = StratifiedKFold(n_splits=10)

# Define custom scorers
def specificity_score(y_true, y_pred):
    """Return the specificity (true-negative rate), TN / (TN + FP).

    Intended as a ``make_scorer`` target for a binary problem whose negative
    label sorts before the positive one (0 / 1 in this notebook).

    Parameters
    ----------
    y_true : array-like
        Ground-truth class labels.
    y_pred : array-like
        Predicted class labels.

    Returns
    -------
    float
        TN / (TN + FP). When ``y_true`` holds no negative samples the ratio
        is undefined; 0.0 is returned and a ``RuntimeWarning`` is emitted.

    Raises
    ------
    ValueError
        If more than two distinct labels are present, because the confusion
        matrix then cannot be unpacked into four cells.
    """
    import warnings

    cm = confusion_matrix(y_true, y_pred)
    if cm.shape == (1, 1):
        # y_true and y_pred contain one and the same label, so sklearn
        # collapsed the matrix to 1x1 and the unpacking below would fail.
        # Rebuild it with the 0/1 labels pinned so all four cells exist.
        cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()

    negatives = tn + fp
    if negatives == 0:
        # 0/0 would produce NaN and turn the cross-validation mean into NaN;
        # report 0.0 instead, as sklearn's recall/precision do by default
        # for ill-defined ratios.
        warnings.warn(
            "Specificity is ill-defined (no negative samples in y_true); returning 0.0",
            RuntimeWarning,
            stacklevel=2,
        )
        return 0.0
    return tn / negatives

def negative_prediction_value_score(y_true, y_pred):
    """Return the negative predictive value, TN / (TN + FN).

    Intended as a ``make_scorer`` target for a binary problem whose negative
    label sorts before the positive one (0 / 1 in this notebook).

    Parameters
    ----------
    y_true : array-like
        Ground-truth class labels.
    y_pred : array-like
        Predicted class labels.

    Returns
    -------
    float
        TN / (TN + FN). When nothing is predicted negative the ratio is
        undefined; 0.0 is returned and a ``RuntimeWarning`` is emitted.

    Raises
    ------
    ValueError
        If more than two distinct labels are present, because the confusion
        matrix then cannot be unpacked into four cells.
    """
    import warnings

    cm = confusion_matrix(y_true, y_pred)
    if cm.shape == (1, 1):
        # y_true and y_pred contain one and the same label, so sklearn
        # collapsed the matrix to 1x1 and the unpacking below would fail.
        # Rebuild it with the 0/1 labels pinned so all four cells exist.
        cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()

    predicted_negatives = tn + fn
    if predicted_negatives == 0:
        # 0/0 would produce NaN and turn the cross-validation mean into NaN;
        # report 0.0 instead, as sklearn's precision does by default when
        # there are no predictions of the scored class.
        warnings.warn(
            "NPV is ill-defined (no negative predictions); returning 0.0",
            RuntimeWarning,
            stacklevel=2,
        )
        return 0.0
    return tn / predicted_negatives

def gmean_score(y_true, y_pred):
    """Return the G-mean: the square root of recall times specificity.

    Parameters
    ----------
    y_true : array-like
        Ground-truth class labels.
    y_pred : array-like
        Predicted class labels.

    Returns
    -------
    float
        ``sqrt(recall * specificity)`` for the given predictions.
    """
    sensitivity = recall_score(y_true, y_pred)
    true_negative_rate = specificity_score(y_true, y_pred)
    product = sensitivity * true_negative_rate
    return np.sqrt(product)

def informedness_score(y_true, y_pred):
    """Return the informedness, computed as recall + specificity - 1.

    Parameters
    ----------
    y_true : array-like
        Ground-truth class labels.
    y_pred : array-like
        Predicted class labels.

    Returns
    -------
    float
        ``recall + specificity - 1`` for the given predictions.
    """
    sensitivity = recall_score(y_true, y_pred)
    true_negative_rate = specificity_score(y_true, y_pred)
    return sensitivity + true_negative_rate - 1

# Wrap every metric function in a scorer, keyed by the name used in the report
scorers = {
    name: make_scorer(metric_func)
    for name, metric_func in [
        ('accuracy', accuracy_score),
        ('balanced_accuracy', balanced_accuracy_score),
        ('recall', recall_score),
        ('specificity', specificity_score),
        ('precision', precision_score),
        ('npv', negative_prediction_value_score),
        ('gmean', gmean_score),
        ('informedness', informedness_score),
    ]
}



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.970 ± 0.033
Balanced_accuracy: 0.747 ± 0.253
Recall: 0.500 ± 0.500
Specificity: 0.995 ± 0.016
Precision: 0.500 ± 0.500
Npv: 0.975 ± 0.025
Gmean: 0.500 ± 0.500
Informedness: 0.495 ± 0.505


In [None]:
# Cross-validate on the training slice only; just the test-fold scores are collected
scores = cross_validate(
    model,
    X_train,
    y_train,
    scoring=scorers,
    cv=cv,
    return_train_score=False,
)

# Summarise each metric across the folds as mean and standard deviation
results = {
    metric: {
        "mean": np.mean(scores['test_'+metric]),
        "std": np.std(scores['test_'+metric]),
    }
    for metric in scorers
}

# Print one "Metric: mean ± std" line per metric
for metric, value in results.items():
    print(f"{metric.capitalize()}: {value['mean']:.3f} ± {value['std']:.3f}")

In [15]:
# Hold-out evaluation: fit on the training slice, then predict the test slice
y_pred = model.fit(X_train, y_train).predict(X_test)

# Score the predictions against the true test labels
conf_matrix = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
balanced_accuracy = balanced_accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)

# Report the built-in metrics first, then the custom ones defined in this notebook
print(f"Accuracy: {accuracy:.3f}")
print(f"Balanced accuracy: {balanced_accuracy:.3f}")
print(f"Recall: {recall:.3f}")
print(f"Precision: {precision:.3f}")
print(f"Confusion matrix:\n{conf_matrix}")
print(f"Specificity: {specificity_score(y_test, y_pred):.3f}")
print(f"NPV: {negative_prediction_value_score(y_test, y_pred):.3f}")
print(f"G-Mean: {gmean_score(y_test, y_pred):.3f}")

Accuracy: 0.968
Balanced accuracy: 0.887
Recall: 0.780
Precision: 0.951
Confusion matrix:
[[348   2]
 [ 11  39]]
Specificity: 0.994
NPV: 0.969
G-Mean: 0.881
