In [None]:
pip install --user scikit-learn

# Sampling experiments

# Undersampling

In [1]:
import os
import pickle
import networkx as nx
import numpy as np
import random 

graph_folder = r'C:\Users\user\Desktop\H01\DATA\graphs_P1'

flare_graphs = []
nonflare_graphs = []

for filename in os.listdir(graph_folder):
    graph_path = os.path.join(graph_folder, filename)
    try:
        with open(graph_path, 'rb') as f:
            G = pickle.load(f)
        
        # Compute node degrees
        degrees = [deg for node, deg in G.degree() if node != len(G.nodes) - 1]
        
        if filename.startswith('flare'):
            label = 1
            flare_graphs.append((degrees, label))
        elif filename.startswith('nonflare'):
            label = 0
            nonflare_graphs.append((degrees, label))
    except Exception as e:
        print(f"Error processing file {filename}: {str(e)}")

random_nonflare_graphs = random.sample(nonflare_graphs, 1180)
undersampled_graphs = random_nonflare_graphs + flare_graphs

# Shuffle the balanced dataset
random.shuffle(undersampled_graphs)
degree_matrix = [degree for degree, label in undersampled_graphs]
label_vector = [label for degree, label in undersampled_graphs]

# Convert lists to NumPy arrays
degree_matrix = np.array(degree_matrix)
label_vector = np.array(label_vector)

# Print or further process the graph_matrix and label_vector as needed
print("Degree Matrix Shape:", degree_matrix.shape)
print("Label Vector Shape:", label_vector.shape)

Degree Matrix Shape: (2360, 23)
Label Vector Shape: (2360,)


# Train SVM

In [2]:
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, f1_score, roc_auc_score

# Create and train an SVM classifier
svm_classifier = SVC(class_weight='balanced')
svm_classifier.fit(degree_matrix, label_vector)

# Make predictions on the test data
y_pred = svm_classifier.predict(degree_matrix)  

# Calculate the confusion matrix
conf_matrix = confusion_matrix(label_vector, y_pred)

# Calculate True Positives, True Negatives, False Positives, and False Negatives
tn, fp, fn, tp = conf_matrix.ravel()

# Calculate Accuracy
accuracy = (tp + tn) / (tp + tn + fp + fn)

# Calculate True Skill Statistics (TSS)
tss = (tp / (tp + fn)) - (fp / (fp + tn))

# Calculate Heidke Skill Score 1 (HSS1)
hss1 = (2 * (tp * tn - fp * fn)) / ((tp + fn) * (fn + tn) + (tp + fp) * (fp + tn))

# Calculate Heidke Skill Score 2 (HSS2)
hss2 = (2 * (tp * tn - fp * fn)) / ((tp + fn) * (tn + fn) + (tp + fp) * (tn + fp))

# Calculate F1 Score
f1 = f1_score(label_vector, y_pred)

# Calculate Gilbert Skill Score
gilbert = (tp - (tp + fn) * (tp + fp) / (tp + tn + fp + fn)) / (tp + tn - (tp + fn) * (tp + fp) / (tp + tn + fp + fn))

# Calculate ROC AUC score
roc_auc = roc_auc_score(label_vector, y_pred)

# Print the results
print("Accuracy:", accuracy)
print("True Skill Statistics (TSS):", tss)
print("Heidke Skill Score 1 (HSS1):", hss1)
print("Heidke Skill Score 2 (HSS2):", hss2)
print("F1 Score:", f1)
print("Gilbert Skill Score:", gilbert)
print("ROC AUC Score:", roc_auc)


Accuracy: 0.8483050847457627
True Skill Statistics (TSS): 0.6966101694915254
Heidke Skill Score 1 (HSS1): 0.6966101694915254
Heidke Skill Score 2 (HSS2): 0.6966101694915254
F1 Score: 0.8556451612903225
Gilbert Skill Score: 0.3039940828402367
ROC AUC Score: 0.8483050847457627


In [3]:
import pickle
import os

# Define the directory where you want to save the model
model_directory = r'C:\Users\user\Desktop\H01\MODELS'

# Create the directory if it doesn't exist
os.makedirs(model_directory, exist_ok=True)

# Save the trained SVM classifier model to a file in the specified directory
model_filename = os.path.join(model_directory, 'svm_classifier_model_undersampled.pkl')
with open(model_filename, 'wb') as model_file:
    pickle.dump(svm_classifier, model_file)

print(f"Saved SVM classifier model to {model_filename}")

Saved SVM classifier model to C:\Users\user\Desktop\H01\MODELS\svm_classifier_model_undersampled.pkl


# Train RF

In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, f1_score, roc_auc_score


rf_classifier = RandomForestClassifier(class_weight='balanced')
rf_classifier.fit(degree_matrix, label_vector)

# Make predictions on the test data
y_pred = rf_classifier.predict(degree_matrix)  

# Calculate the confusion matrix
conf_matrix = confusion_matrix(label_vector, y_pred)

# Calculate True Positives, True Negatives, False Positives, and False Negatives
tn, fp, fn, tp = conf_matrix.ravel()

# Calculate Accuracy
accuracy = (tp + tn) / (tp + tn + fp + fn)

# Calculate True Skill Statistics (TSS)
tss = (tp / (tp + fn)) - (fp / (fp + tn))

# Calculate Heidke Skill Score 1 (HSS1)
hss1 = (2 * (tp * tn - fp * fn)) / ((tp + fn) * (fn + tn) + (tp + fp) * (fp + tn))

# Calculate Heidke Skill Score 2 (HSS2)
hss2 = (2 * (tp * tn - fp * fn)) / ((tp + fn) * (tn + fn) + (tp + fp) * (tn + fp))

# Calculate F1 Score
f1 = f1_score(label_vector, y_pred)

# Calculate Gilbert Skill Score
gilbert = (tp - (tp + fn) * (tp + fp) / (tp + tn + fp + fn)) / (tp + tn - (tp + fn) * (tp + fp) / (tp + tn + fp + fn))

# Calculate ROC AUC score
roc_auc = roc_auc_score(label_vector, y_pred)

# Print the results
print("Accuracy:", accuracy)
print("True Skill Statistics (TSS):", tss)
print("Heidke Skill Score 1 (HSS1):", hss1)
print("Heidke Skill Score 2 (HSS2):", hss2)
print("F1 Score:", f1)
print("Gilbert Skill Score:", gilbert)
print("ROC AUC Score:", roc_auc)

Accuracy: 1.0
True Skill Statistics (TSS): 1.0
Heidke Skill Score 1 (HSS1): 1.0
Heidke Skill Score 2 (HSS2): 1.0
F1 Score: 1.0
Gilbert Skill Score: 0.3333333333333333
ROC AUC Score: 1.0


In [5]:
import pickle
import os

model_directory = r'C:\Users\user\Desktop\H01\MODELS'
os.makedirs(model_directory, exist_ok=True)

# Save the trained Random Forest classifier model to a file in the specified directory
model_filename = os.path.join(model_directory, 'rf_classifier_model_undersampled.pkl')
with open(model_filename, 'wb') as model_file:
    pickle.dump(rf_classifier, model_file)

print(f"Saved Random Forest classifier model to {model_filename}")

Saved Random Forest classifier model to C:\Users\user\Desktop\H01\MODELS\rf_classifier_model_undersampled.pkl


# Test Results

In [6]:
import os
import pickle
import networkx as nx
import numpy as np

graph_folder = r'C:\Users\user\Desktop\H01\DATA\graphs_P2'
graph_vectors = []
labels = []

# Iterate through all files in the folder
for filename in os.listdir(graph_folder):
    graph_path = os.path.join(graph_folder, filename)
    try:
        with open(graph_path, 'rb') as f:
            G = pickle.load(f)
        
        # Compute node degrees
        degrees = [deg for node, deg in G.degree() if node != len(G.nodes) - 1]
        
        # Determine label based on filename
        if filename.startswith('flare'):
            label = 1
        elif filename.startswith('nonflare'):
            label = 0
            
        # Append degrees and label to lists
        graph_vectors.append(degrees)
        labels.append(label)
    
    except Exception as e:
        print(f"Error processing file {filename}: {str(e)}")

# Convert lists to NumPy arrays
graph_matrix = np.array(graph_vectors)
label_vector = np.array(labels)

# Print or further process the graph_matrix and label_vector as needed
print("Graph Matrix Shape:", graph_matrix.shape)
print("Label Vector Shape:", label_vector.shape)

Graph Matrix Shape: (79541, 23)
Label Vector Shape: (79541,)


# Test SVM

In [14]:
import os
import pickle
from sklearn.metrics import confusion_matrix, f1_score, roc_auc_score
import numpy as np

# Load the saved SVM model
svm_model_path = r'C:\Users\user\Desktop\H01\MODELS\svm_classifier_model_undersampled.pkl'
with open(svm_model_path, 'rb') as svm_model_file:
    svm_classifier = pickle.load(svm_model_file)

# Make predictions using the loaded models
svm_predictions = svm_classifier.predict(graph_matrix)

# Calculate the confusion matrix
conf_matrix = confusion_matrix(label_vector, svm_predictions)

# Calculate True Positives, True Negatives, False Positives, and False Negatives
tn, fp, fn, tp = conf_matrix.ravel()
print("tn, fp, fn, tp: ", tn, fp, fn, tp)
# Calculate Accuracy
accuracy = (tp + tn) / (tp + tn + fp + fn)

# Calculate True Skill Statistics (TSS)
tss = (tp / (tp + fn)) - (fp / (fp + tn))

# Calculate Heidke Skill Score 1 (HSS1)
hss1 = (tp/(tp+fn))*(2 - (tp + fp)/tp )

# Calculate Heidke Skill Score 2 (HSS2)
hss2 =  (2 * (tp * tn - fp * fn)) / ((tp + fn) * (tn + fn) + (tp + fp) * (tn + fp))

# Calculate F1 Score
f1 = f1_score(label_vector, svm_predictions)

# Calculate Gilbert Skill Score
gilbert = (tp - (tp + fn) * (tp + fp) / (tp + tn + fp + fn)) / (tp + tn - (tp + fn) * (tp + fp) / (tp + tn + fp + fn))

# Calculate ROC AUC score
roc_auc = roc_auc_score(label_vector, svm_predictions)

# Print the results
print("SVM Model Evaluation Metrics:")
print("Accuracy:", accuracy)
print("True Skill Statistics (TSS):", tss)
print("Heidke Skill Score 1 (HSS1):", hss1)
print("Heidke Skill Score 2 (HSS2):", hss2)
print("F1 Score:", f1)
print("Gilbert Skill Score:", gilbert)
print("ROC AUC Score:", roc_auc)

tn, fp, fn, tp:  54385 23871 496 789
SVM Model Evaluation Metrics:
Accuracy: 0.6936548446713016
True Skill Statistics (TSS): 0.30897046866833155
Heidke Skill Score 1 (HSS1): -17.962645914396887
Heidke Skill Score 2 (HSS2): 0.031064855586285908
F1 Score: 0.06082096743110425
Gilbert Skill Score: 0.00713114802715343
ROC AUC Score: 0.6544852343341657


# Test RF

In [15]:
# Load the saved Random Forest model
rf_model_path = r'C:\Users\user\Desktop\H01\MODELS\rf_classifier_model_undersampled.pkl'
with open(rf_model_path, 'rb') as rf_model_file:
    rf_classifier = pickle.load(rf_model_file)
    
rf_predictions = rf_classifier.predict(graph_matrix)

# Calculate the confusion matrix
conf_matrix = confusion_matrix(label_vector, rf_predictions)

# Calculate True Positives, True Negatives, False Positives, and False Negatives
tn, fp, fn, tp = conf_matrix.ravel()
print("tn, fp, fn, tp: ", tn, fp, fn, tp)

# Calculate Accuracy
accuracy = (tp + tn) / (tp + tn + fp + fn)

# Calculate True Skill Statistics (TSS)
tss = (tp / (tp + fn)) - (fp / (fp + tn))

# Calculate Heidke Skill Score 1 (HSS1)
hss1 = (tp/(tp+fn))*(2 - (tp + fp)/tp )

# Calculate Heidke Skill Score 2 (HSS2)
hss2 = (2 * (tp * tn - fp * fn)) / ((tp + fn) * (tn + fn) + (tp + fp) * (tn + fp))

# Calculate F1 Score
f1 = f1_score(label_vector, rf_predictions)

# Calculate Gilbert Skill Score
gilbert = (tp - (tp + fn) * (tp + fp) / (tp + tn + fp + fn)) / (tp + tn - (tp + fn) * (tp + fp) / (tp + tn + fp + fn))

# Calculate ROC AUC score
roc_auc = roc_auc_score(label_vector, rf_predictions)

# Print the results
print("Random Forest Model Evaluation Metrics:")
print("Accuracy:", accuracy)
print("True Skill Statistics (TSS):", tss)
print("Heidke Skill Score 1 (HSS1):", hss1)
print("Heidke Skill Score 2 (HSS2):", hss2)
print("F1 Score:", f1)
print("Gilbert Skill Score:", gilbert)
print("ROC AUC Score:", roc_auc)

tn, fp, fn, tp:  58590 19666 575 710
Random Forest Model Evaluation Metrics:
Accuracy: 0.7455274638236884
True Skill Statistics (TSS): 0.30122576844470156
Heidke Skill Score 1 (HSS1): -14.751750972762647
Heidke Skill Score 2 (HSS2): 0.03626418141447746
F1 Score: 0.06555560685102259
Gilbert Skill Score: 0.006457801136637128
ROC AUC Score: 0.6506128842223508


# Oversampling