## <center> Classification Model </center>

### <center> Polytechnic University of Leiria </center>

#### <center> Patrícia Isabel Santos Martinho </center>

Libraries

In [None]:

#from benfordslaw import benfordslaw # type: ignore
import numpy as np # type: ignore
import matplotlib.pyplot as plt # type: ignore
from scipy.stats import chisquare
from sklearn.metrics import confusion_matrix, roc_curve, auc
from scipy.stats import chi2
from scipy.stats import entropy  # Função para calcular KL
import pandas as pd

Variable Initialization

In [None]:
# Dataset dimensions and anomaly-injection settings
colunas=1000 # features
linhas=2000 # instances
prop_linhas_manipuladas=0.3 # proportion of anomalous rows
prop_manipul_linha=0.1 # proportion of anomalous values inside an anomalous row
alpha=0.05 # significance level below which a row is flagged as anomalous
df_ficheiro="Dataset.txt" # dataset file name

# The simulation-based tests draw from NumPy's global random generator; seeding it
# here makes a Restart & Run All reproduce the same p-values and tables.
semente_aleatoria=42
np.random.seed(semente_aleatoria)



In [None]:
# Absolute counts derived from the configured proportions
linhas_manipuladas=int(linhas*prop_linhas_manipuladas) # absolute amount of anomalous rows
print(linhas_manipuladas)
linhas_s_fraude=linhas-linhas_manipuladas # absolute amount of BL conform rows
print(linhas_s_fraude)
qtde_fraude_linhaF=int(colunas*prop_manipul_linha) # absolute amount of anomalies in an anomalous row
# Taken as the complement so that both parts always add up to `colunas`; truncating
# colunas*(1-prop) separately can lose one value to floating-point error.
qtde_ben_linhaF=colunas-qtde_fraude_linhaF # absolute amount of BL conform numbers in an anomalous row

Functions

In [None]:
def distribuicao_benford(n):
    """Return the first-digit counts expected under Benford's Law for n values.

    Parameters
    ----------
    n : number
        Scale applied to the probabilities (n=1 yields the probabilities themselves).

    Returns
    -------
    numpy.ndarray
        Nine entries, one per leading digit 1..9, in that order.
    """
    digitos = np.arange(1, 10)
    probabilidades = np.log10(1 + 1 / digitos)  # P(d) = log10(1 + 1/d)
    # The last probability is taken as the remainder so the nine values add up to 1
    probabilidades[-1] = 1 - sum(probabilidades[:-1])
    return probabilidades * n

In [None]:
def obter_primeiro_digito(numero):
    """Return the first significant digit (1..9) of a number, or 0 for zero.

    The sign is ignored. Integers are read from their exact decimal form.
    Other values are converted to float and read from the shortest decimal
    representation that round-trips. This avoids repeatedly multiplying a
    float by 10, which accumulates rounding error and can land just below
    the true leading digit before truncation.

    Parameters
    ----------
    numero : int or float (Python or NumPy scalar)

    Returns
    -------
    int
        Leading digit in 1..9, or 0 when numero is zero.

    Raises
    ------
    ValueError
        If numero is NaN or infinite (it has no leading digit).
    """
    import numbers

    num = abs(numero)  # Work only with positive values
    if num == 0:
        return 0

    if isinstance(num, numbers.Integral):
        texto = str(int(num))
    else:
        texto = repr(float(num))  # e.g. '0.0003', '1e-07', '12345.678'

    # The mantissa precedes any exponent, so the first non-zero digit found
    # is the leading significant digit.
    for caractere in texto:
        if caractere in "123456789":
            return int(caractere)

    raise ValueError(f"cannot extract a first digit from {numero!r}")

In [None]:
def array_primeiros_digitos (numeros):
    """Return the first digit of every value in numeros, leaving out zeros.

    A value whose first digit is reported as 0 by obter_primeiro_digito
    contributes nothing to the result; the order of the remaining digits
    follows the order of the input.
    """
    digitos = (obter_primeiro_digito(numero) for numero in numeros)
    return [digito for digito in digitos if digito != 0]

In [None]:
def frequencia_primeiros_digitos(numeros):
    """Count how many values start with each digit 1..9.

    Values whose first digit is reported as 0 (i.e. zeros) are skipped, so
    the returned counts can add up to less than len(numeros). Without the
    skip, digit 0 would index position -1 and be counted as a 9.

    Parameters
    ----------
    numeros : iterable of numbers

    Returns
    -------
    numpy.ndarray
        Nine float counts; position d-1 holds the count for leading digit d.
    """
    frequencias = np.zeros(9)  # To store the frequencies of digits 1 through 9
    for numero in numeros:
        primeiro_digito = obter_primeiro_digito(numero)
        if primeiro_digito == 0:
            continue  # no leading digit to count
        frequencias[primeiro_digito - 1] += 1  # Increases the digit count
    return frequencias

In [None]:
def f_acumulada (f_relativas):
    """Return the running (cumulative) sums of a non-empty sequence as a list.

    Element i of the result is the sum of f_relativas[0..i].
    """
    corrente = f_relativas[0]
    acumuladas = [corrente]
    for valor in f_relativas[1:]:
        corrente = valor + corrente
        acumuladas.append(corrente)
    return acumuladas


In [None]:
def calcular_mad(observadas, esperadas):
    """Mean absolute deviation between observed and expected frequencies.

    Both arguments may be lists or NumPy arrays of equal length; they are
    converted to float arrays so that two plain lists can be subtracted.
    """
    observadas = np.asarray(observadas, dtype=float)
    esperadas = np.asarray(esperadas, dtype=float)
    return np.mean(np.abs(observadas - esperadas))

In [None]:
def calcular_ks (observadas, esperadas):
    """Kolmogorov-Smirnov distance between two frequency vectors.

    Both inputs are accumulated with f_acumulada; the result is the largest
    absolute gap between the two cumulative curves.
    """
    acum_obs, acum_esp = (np.array(f_acumulada(frequencias))
                          for frequencias in (observadas, esperadas))
    desvios = np.abs(acum_obs - acum_esp)
    return np.max(desvios)

In [None]:
def calcular_euclidiana (observadas, esperadas):
    """Euclidean distance between observed and expected frequencies.

    Both arguments may be lists or NumPy arrays of equal length; they are
    converted to float arrays so that two plain lists can be subtracted.
    """
    observadas = np.asarray(observadas, dtype=float)
    esperadas = np.asarray(esperadas, dtype=float)
    return np.sqrt(np.sum((observadas - esperadas) ** 2))

In [None]:
def calcular_hellinger(observadas, esperadas):
    """Hellinger distance: sqrt(0.5 * sum((sqrt(o) - sqrt(e))**2)).

    The inputs are used as given (no normalisation is applied here).
    """
    diferenca_raizes = np.sqrt(observadas) - np.sqrt(esperadas)
    return np.sqrt(0.5 * np.sum(diferenca_raizes**2))

In [None]:
def calcular_kl(observadas, esperadas):
    """Kullback-Leibler divergence of observadas relative to esperadas.

    Delegates to scipy.stats.entropy, which computes the KL divergence when
    it is given two distributions.
    """
    return entropy(observadas, esperadas)

In [None]:
def teste_hipoteses (teste, observacoes, numero_simulacoes):
    """Monte Carlo goodness-of-fit test of first-digit counts against Benford's Law.

    The chosen distance is computed between the observed counts and the
    Benford counts expected for the same number of values. Its null
    distribution is approximated by drawing numero_simulacoes samples of that
    size from Benford's Law with NumPy's global random generator.

    Parameters
    ----------
    teste : str
        One of "mad", "ks", "euc", "hel", "kl".
    observacoes : sequence of 9 numbers
        Observed counts for leading digits 1..9.
    numero_simulacoes : int
        Number of simulated samples. The expected counts are scaled to
        sum(observacoes), not to this value, so the two are independent.

    Returns
    -------
    tuple
        (observed statistic, p-value). The p-value is the share of simulated
        statistics greater than or equal to the observed one.

    Raises
    ------
    ValueError
        If teste is not a supported name.
    """
    estatisticas = {
        "mad": calcular_mad,
        "ks": calcular_ks,
        "euc": calcular_euclidiana,
        "hel": calcular_hellinger,
        "kl": calcular_kl,
    }
    if teste not in estatisticas:
        # Fail before simulating rather than after, with a readable message
        raise ValueError(f"Unknown test {teste!r}; expected one of {sorted(estatisticas)}")
    calcular = estatisticas[teste]

    # Expected counts on the same scale as the observed and simulated counts
    tamanho_amostra = int(sum(observacoes))
    esperadas = distribuicao_benford(tamanho_amostra)

    # Sampling probabilities, independent of the sample size
    probabilidades = distribuicao_benford(1)
    probabilidades = probabilidades / sum(probabilidades)

    # Generate samples that follow Benford's Law
    digitos = np.arange(1, 10)
    valores_simulados = []
    for _ in range(numero_simulacoes):
        simulados = np.random.choice(digitos, p=probabilidades, size=tamanho_amostra)
        freq_simuladas = [np.sum(simulados == d) for d in range(1, 10)]
        valores_simulados.append(calcular(freq_simuladas, esperadas))

    valor_observado = calcular(observacoes, esperadas)

    # Proportion of simulated statistics at least as extreme as the observed one
    p_value_teste = np.mean(np.array(valores_simulados) >= valor_observado)

    return(valor_observado,p_value_teste)

In [None]:

def Fisher(p_values):
    """Combine p-values with Fisher's method.

    The statistic -2 * sum(log(p)) is referred to a chi-square distribution
    with 2 * len(p_values) degrees of freedom.

    NOTE(review): Fisher's method assumes the combined tests are independent;
    confirm that this is acceptable for tests computed on the same row.

    Parameters
    ----------
    p_values : sequence of float
        Individual p-values. A value of 0 makes the statistic infinite; in
        that case the inputs are printed to help locate the offending test.

    Returns
    -------
    float
        Combined p-value.
    """
    valores = np.asarray(p_values, dtype=float)
    fisher_stat = -2 * np.sum(np.log(valores))
    if np.isinf(fisher_stat):  # log(0) = -inf, so the statistic diverges to +inf
        print(p_values)
    # Survival function instead of 1 - cdf: keeps precision for very small results
    combined_p_value = chi2.sf(fisher_stat, 2 * len(p_values))
    return combined_p_value

In [None]:
def matriz_confusao(class_real, previsto):
    """Return TN, FP, FN, TP and the 2x2 confusion matrix for binary labels.

    Passing labels=[0, 1] makes confusion_matrix return a 2x2 matrix even
    when one of the classes is missing from the data, so the four cells can
    always be unpacked directly and no fallback branch is needed.

    Parameters
    ----------
    class_real : sequence of int
        True labels (0 or 1).
    previsto : sequence of int
        Predicted labels (0 or 1).

    Returns
    -------
    tuple
        (tn, fp, fn, tp, conf_matrix)
    """
    labels = [0, 1]  # Expected classes
    conf_matrix = confusion_matrix(class_real, previsto, labels=labels)
    tn, fp, fn, tp = conf_matrix.ravel()
    return tn, fp, fn, tp, conf_matrix

In [None]:
def gerar_roc(class_real, p_values, alpha_values):
    """Print the AUC and plot the ROC curve obtained by sweeping the cut-off alpha.

    A row is predicted positive (with anomalies) when its p-value is below
    alpha, so low p-values indicate the positive class.

    Parameters
    ----------
    class_real : sequence of int
        True labels (1 = positive).
    p_values : array-like
        One p-value per row; any shape, flattened internally.
    alpha_values : iterable of float
        Cut-off values to evaluate.

    Returns
    -------
    tuple of numpy.ndarray
        (false positive rates, true positive rates, alpha values), aligned
        element by element.
    """
    fprs = []
    tprs = []

    p_values = np.asarray(p_values).ravel()  # Ensure that p_values is a one-dimensional vector

    # roc_curve expects scores that grow with the positive class, whereas small
    # p-values mean "positive" here; negating keeps the ranking and fixes the direction.
    fpr_full, tpr_full, _ = roc_curve(class_real, -p_values)
    auc_value = auc(fpr_full, tpr_full)
    print(f"AUC total: {auc_value}")

    for alpha in alpha_values:
        previsto = (p_values < alpha).astype(int) # positive values - with anomalies

        # labels=[0, 1] guarantees a 2x2 matrix even if a class is absent
        tn, fp, fn, tp = confusion_matrix(class_real, previsto, labels=[0, 1]).ravel()

        # Calculate the rate of false positives and true positives
        fpr = fp / (fp + tn) if (fp + tn) > 0 else 0
        tpr = tp / (tp + fn) if (tp + fn) > 0 else 0

        fprs.append(fpr)
        tprs.append(tpr)

    # Generate the ROC curve
    plt.figure(figsize=(6, 6))
    plt.plot(fprs, tprs, marker="o", label="ROC curve")
    plt.plot([0, 1], [0, 1], linestyle="--", color="gray")  # Reference line (random model)
    plt.xlabel("1-Specificity (FPR)")
    plt.ylabel("Sensibility (TPR)")
    plt.legend()
    plt.grid()
    plt.show()

    return np.array(fprs), np.array(tprs), np.array(alpha_values)




In [None]:
def grafico_digitos (dados,nl):
    """Plot the first-digit proportions of one dataset row against Benford's Law.

    Parameters
    ----------
    dados : 2-D array
        Dataset; the last column of each row is left out of the digit count.
    nl : int
        Index of the row to plot.
    """
    digitos = list(range(1, 10))
    estilo_linha = dict(marker='o', linestyle='-', alpha=0.6)

    def anotar(eixo, alturas, formato, deslocamento):
        # Write each value just above its marker
        for digito, altura in zip(digitos, alturas):
            eixo.annotate(format(altura, formato),
                          xy=(digito, altura),
                          xytext=deslocamento,
                          textcoords='offset points',
                          ha='center',
                          va='bottom')

    fig, ax = plt.subplots(figsize=(12, 6))

    # Values of the selected row, without the trailing column
    valores_linha = np.array(dados[nl,:-1])

    # Benford reference curve, as proportions, labelled with 3 decimal places
    proporcoes_benford = distribuicao_benford(colunas) / colunas
    ax.plot(range(1, 10), proporcoes_benford,
            label="Benford's Law Distribution", color='blue', **estilo_linha)
    anotar(ax, proporcoes_benford, '.3f', (0, 5))

    # Proportions observed in the row, labelled with their unrounded value
    proporcoes_linha = frequencia_primeiros_digitos(valores_linha) / len(valores_linha)
    ax.plot(digitos, proporcoes_linha,
            label=f"Digits frequency in line nr. {nl}", color='orange', **estilo_linha)
    anotar(ax, proporcoes_linha, '', (0, 3))

    # Axis labels, legend and one tick per digit
    ax.set_xlabel("First Digit", fontsize=14)
    ax.set_ylabel("Frequency", fontsize=14)
    ax.legend(fontsize=14)
    ax.set_xticks(digitos)
    ax.set_xticklabels(digitos)

    plt.show()

In [None]:
# Load the comma-separated dataset. The cells below read the last column as the
# class label and the remaining columns as the values to test.
dados_finais = np.loadtxt(df_ficheiro, delimiter=',')

In [None]:
# Show the loaded array as the cell output for a quick visual check
dados_finais

In [None]:
# True class of every row: last column of the dataset, as integers
class_real = dados_finais[:, -1].astype(int)
print(class_real)

In [None]:
# Rows per class. minlength=2 keeps counts[1] valid (as 0) when no row is labelled 1.
counts = np.bincount(class_real, minlength=2)
print("Negativos", counts[0])
print("Positivos", counts[1])

In [None]:
# Compare the first-digit proportions of row 20 with Benford's Law
grafico_digitos (dados_finais,20)

In [None]:
def diagnosticar_chi_square(first_digit_frequencies, expected_distribution):
    """
    Print a diagnostic of the inputs given to the chi-square test.

    Shows both frequency vectors, their totals, and the absolute and relative
    (percent of the expected total) difference between those totals.
    """
    soma_observadas = np.sum(first_digit_frequencies)
    soma_esperadas = np.sum(expected_distribution)
    diferenca = soma_observadas - soma_esperadas
    diferenca_relativa = abs(diferenca) / soma_esperadas * 100

    print("=== DIAGNÓSTICO CHI-SQUARE ===")
    print(f"Frequências observadas: {first_digit_frequencies}")
    print(f"Distribuição esperada: {expected_distribution}")
    print(f"Soma observadas: {soma_observadas}")
    print(f"Soma esperadas: {soma_esperadas}")
    print(f"Diferença: {diferenca}")
    print(f"Diferença relativa: {diferenca_relativa:.10f}%")
    

Verify compliance with Benford's Law

In [None]:
# Expected first-digit counts for a row in which all `colunas` values have a leading digit
expected_distribution = distribuicao_benford(colunas)

# Floor applied to every individual p-value so that log(p) stays finite in Fisher's statistic
p_value_minimo = 1e-15

previsto_chi2=[]
p_values_chi2=[]

previsto_f = []
p_values_f = []
previsto_ks=[]
previsto_MAD=[]
previsto_hel=[]
previsto_euc=[]
previsto_kl=[]
p_values_ks=[]
p_values_MAD=[]
p_values_euc=[]
p_values_hel=[]
p_values_kl=[]

# Simulation-based tests, keyed by the name teste_hipoteses expects and listed in the
# order they are run; each maps to its (p-value list, prediction list).
resultados_simulados = {
    "mad": (p_values_MAD, previsto_MAD),
    "ks": (p_values_ks, previsto_ks),
    "euc": (p_values_euc, previsto_euc),
    "hel": (p_values_hel, previsto_hel),
    "kl": (p_values_kl, previsto_kl),
}

for n_linha in range(linhas):
    dfbl = np.array(dados_finais[n_linha,:-1])
    first_digit_frequencies = frequencia_primeiros_digitos(dfbl)

    # chisquare requires observed and expected totals to agree. Scale the expected
    # counts to the observed total and print the diagnostic only for rows where the
    # totals actually differ, instead of for every row.
    total_observado = first_digit_frequencies.sum()
    if total_observado != colunas:
        diagnosticar_chi_square(first_digit_frequencies, expected_distribution)
    esperadas_linha = expected_distribution * (total_observado / colunas)

    p_values=[]  # p-values of this row, combined by Fisher at the end

    # Apply the chi-square test
    chi2_stat, p_value_chi2 = chisquare(first_digit_frequencies, esperadas_linha)
    p_value_chi2 = max(p_value_chi2, p_value_minimo)
    p_values.append(p_value_chi2)
    p_values_chi2.append(p_value_chi2)
    previsto_chi2.append(1 if p_value_chi2 < alpha else 0)

    # Apply the simulation-based tests (MAD, Kolmogorov-Smirnov, Euclidean,
    # Hellinger, Kullback-Leibler); a row is labelled 1 when p < alpha
    for teste, (lista_p_values, lista_previsto) in resultados_simulados.items():
        estatistica, p_value = teste_hipoteses(teste, first_digit_frequencies, colunas)
        p_value = max(p_value, p_value_minimo)
        p_values.append(p_value)
        lista_p_values.append(p_value)
        lista_previsto.append(1 if p_value < alpha else 0)

    # Fisher's combination of the six p-values above
    # NOTE(review): Fisher's method assumes independent tests; confirm this is
    # acceptable for tests computed on the same row.
    p_value_fisher = Fisher(p_values)
    p_values_f.append(p_value_fisher)
    previsto_f.append(1 if p_value_fisher < alpha else 0)
    


In [None]:
# Test names, in the same order as the prediction lists below
metricas = ["Chi-square", "mean absolute deviation","Kolmogorov-Smirnov", "Euclidean", "Hellinger", "Kullback-Leibler", "Fisher"]
previstos = [previsto_chi2, previsto_MAD, previsto_ks, previsto_euc,previsto_hel, previsto_kl, previsto_f]

# One (TN, FP, FN, TP) tuple per test; the matrix itself is not needed here
contagens = [matriz_confusao(class_real, previsao)[:4] for previsao in previstos]
tns, fps, fns, tps = (list(coluna) for coluna in zip(*contagens))

# Confusion-matrix cells per test
df = pd.DataFrame({"Metric": metricas, "TN": tns, "FP": fps, "FN": fns, "TP": tps})

# Evaluation metrics derived from the cells
precisao = df["TP"] / (df["TP"] + df["FP"])
sensibilidade = df["TP"] / (df["TP"] + df["FN"])
df["Precision"] = precisao
df["Recall"] = sensibilidade
df["F1-score"] = 2 * (precisao * sensibilidade) / (precisao + sensibilidade)

# Report header followed by the table
cabecalho = [
    "Dataset:",
    f"Number of features:{colunas}",
    f"Number of instances:{linhas}",
    "  ",
    "Actual class:",
    f"Positives --> with anomalies: {counts[1]}",
    f"Negatives --> no anomalies: {counts[0]}",
    "  ",
    f"Proportion of anomalies in an anomalous row: {prop_manipul_linha}",
    "-----------",
    f"|Alpha={alpha}|",
    "-----------",
]
for linha_texto in cabecalho:
    print(linha_texto)
print(df)

# Save results in Excel format
df.to_excel("Performance.xlsx", index=False)

In [None]:

# ROC analysis of the chi-square p-values over a grid of cut-offs in [0, 1)
alpha_values = np.arange(0, 1, 0.001)
p_values = np.array(p_values_chi2).reshape(1, -1)  # single row holding all p-values

# statistics
print("Statistics of p_values:")
for rotulo, resumo in (("Average:", np.mean),
                       ("Standard Deviation:", np.std),
                       ("Min:", np.min),
                       ("Max:", np.max)):
    print(rotulo, resumo(p_values))

fprs, tprs, alpha_values = gerar_roc(class_real, p_values, alpha_values)

In [None]:
# Work on arrays so the criteria below are element-wise
fprs, tprs, alpha_values = (np.array(valores) for valores in (fprs, tprs, alpha_values))

# Youden's criterion: cut-off that maximises TPR - FPR
youden_index = np.argmax(tprs - fprs)
best_alpha_youden = alpha_values[youden_index]

# Cut-off whose ROC point lies closest to the ideal corner (FPR=0, TPR=1)
distances = np.sqrt(fprs**2 + (1 - tprs)**2)
best_alpha_distance = alpha_values[np.argmin(distances)]

print(f"Best cut-off point by Youden’s criterion: {best_alpha_youden}")
print(f"Best cut-off point by the criterion of least distance: {best_alpha_distance}")

