In [1]:
#This notebook is here to calculate the confusion matrix from scratch to understand the work ability of our model

#importing useful python libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix#,confusion_matrix_at_thresholds
#from sklearn.metrics import confusion_matrix_at_thresholds

In [2]:
#loading dataset
df = pd.read_parquet("subset of baseline with samples 50neg & pos.parquet")

In [3]:
#checking if loaded properly:column check
df.keys()


Index(['object', 'mean', 'weighted_mean', 'standard_deviation', 'median',
       'amplitude', 'beyond_1_std', 'cusum', 'inter_percentile_range_10',
       'kurtosis', 'linear_trend', 'linear_trend_sigma', 'linear_trend_noise',
       'linear_fit_slope', 'linear_fit_slope_sigma', 'linear_fit_reduced_chi2',
       'magnitude_percentage_ratio_40_5', 'magnitude_percentage_ratio_20_10',
       'maximum_slope', 'median_absolute_deviation',
       'median_buffer_range_percentage_10', 'percent_amplitude',
       'mean_variance', 'anderson_darling_normal', 'chi2', 'skew', 'stetson_K',
       'cluster', 'cluster_id'],
      dtype='object')

In [4]:
#elements check
df.head(20)

Unnamed: 0,object,mean,weighted_mean,standard_deviation,median,amplitude,beyond_1_std,cusum,inter_percentile_range_10,kurtosis,...,median_absolute_deviation,median_buffer_range_percentage_10,percent_amplitude,mean_variance,anderson_darling_normal,chi2,skew,stetson_K,cluster,cluster_id
0,ZTF18aayuzaj,13.688346,13.685163,0.358838,13.631022,1.704284,0.301001,0.170863,0.969517,0.810235,...,0.210744,0.437644,2.291209,0.026215,20.932565,520.519968,0.432772,0.768013,1.0,1.0
1,ZTF18abhbhwd,17.320197,17.24375,0.278426,17.26683,1.103769,0.292739,0.219918,0.787825,0.196193,...,0.147615,0.319247,1.45116,0.016075,93.740888,130.468214,0.865308,0.804507,1.0,1.0
2,ZTF18aaxcyth,15.436886,15.43302,0.130212,15.435888,0.826002,0.329772,0.122198,0.333758,3.247273,...,0.088842,0.473102,1.279842,0.008435,1.824835,63.276991,0.411712,0.786535,1.0,1.0
3,ZTF18aazdcyl,15.689254,15.684933,0.135519,15.741753,0.621263,0.24016,0.271068,0.361851,0.922049,...,0.05164,0.533942,0.680359,0.008638,75.146382,69.269782,-0.887205,0.778961,1.0,1.0
4,ZTF18abebizu,15.751828,15.722266,0.347854,15.844798,1.027375,0.53224,0.405152,0.858655,-1.434871,...,0.288947,0.160656,1.338771,0.022083,97.552446,475.655417,-0.197862,0.916768,1.0,1.0
5,ZTF18abeboej,16.873177,16.85304,0.218995,16.830446,0.604088,0.241023,0.239213,0.563981,1.042065,...,0.07024,0.45593,0.687952,0.012979,85.955018,94.008855,0.827206,0.686488,1.0,1.0
6,ZTF18ablngjt,15.562575,15.556951,0.229989,15.59374,0.49676,0.24972,0.289331,0.619872,-0.682549,...,0.165377,0.180291,0.579692,0.014778,74.573044,192.966851,-0.70074,0.834077,1.0,1.0
7,ZTF18aawkwln,15.6352,15.635153,0.055664,15.633553,0.410186,0.296514,0.246065,0.134338,4.446734,...,0.037347,0.536417,0.445989,0.00356,7.117692,13.929589,0.139707,0.772886,1.0,1.0
8,J0614-2725,18.76502,18.543927,0.398182,18.761408,1.631694,0.279602,0.119857,0.935667,2.276925,...,0.242128,0.353234,2.082767,0.021219,2.637823,31.199294,0.604146,0.809469,,
9,ZTF18adkcxpf,14.63197,14.629926,0.133584,14.63008,0.741823,0.257652,0.140158,0.311347,5.767534,...,0.067157,0.520868,1.136209,0.00913,30.313342,109.092012,1.047373,0.683152,1.0,1.0


In [5]:
#stepping forward with useful numerical data only
X = df.drop(columns=['object','cluster', 'cluster_id']).values

In [6]:
# object column contains names like 'ZTF18abc...' or non-ZTF names : Binary conversion of objects
y = np.where(df['object'].str.contains('ZTF', case=False),
             -1,   # ZTF → negative
              1)   # non-ZTF → positive

In [7]:
#runthrough
X

array([[ 1.36883456e+01,  1.36851629e+01,  3.58837566e-01, ...,
         5.20519968e+02,  4.32772256e-01,  7.68013461e-01],
       [ 1.73201972e+01,  1.72437497e+01,  2.78425993e-01, ...,
         1.30468214e+02,  8.65308001e-01,  8.04507225e-01],
       [ 1.54368865e+01,  1.54330198e+01,  1.30212161e-01, ...,
         6.32769913e+01,  4.11711919e-01,  7.86535165e-01],
       ...,
       [ 1.80104926e+01,  1.72123293e+01,  1.23640135e+00, ...,
         4.53392793e+02,  8.65250910e-01,  8.86083625e-01],
       [ 1.70137729e+01,  1.69835731e+01,  2.21157098e-01, ...,
         9.81349013e+01,  1.31091092e+00,  6.70645606e-01],
       [ 1.63111787e+01,  1.63002903e+01,  1.86079931e-01, ...,
         1.17600481e+02, -7.29353574e-01,  8.29970429e-01]])

In [8]:
#runthrough
y

array([-1, -1, -1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1,  1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1,  1,  1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1, -1,  1,
       -1, -1, -1, -1,  1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        1,  1, -1, -1, -1, -1,  1, -1, -1, -1, -1,  1,  1, -1,  1, -1, -1,
       -1, -1, -1, -1, -1,  1, -1, -1, -1, -1, -1, -1,  1, -1, -1,  1, -1,
       -1, -1, -1, -1,  1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  1,
        1, -1, -1,  1,  1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1,  1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1, -1,  1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1, -1, -1, -1, -1,  1,  1,
       -1, -1, -1, -1, -1

In [9]:
#run accuracy
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
)

final_model= RandomForestClassifier(
    n_estimators=60,
    class_weight="balanced",
   
)

final_model.fit(X_train, y_train)

test_accuracy = final_model.score(X_test, y_test)
print("\nFinal test accuracy:", test_accuracy)



Final test accuracy: 0.9125


In [10]:
#measuring robustness through 5-fold cross-validation.
scores = cross_val_score(
    clf,
    X,
    y,
    cv=5
)

print("Cross-validation scores:", scores)
print("Mean CV accuracy:", scores.mean())
print("Std CV accuracy:", scores.std())


NameError: name 'clf' is not defined

In [None]:
# Save training set
df_train = pd.DataFrame(X_train)
df_train['y'] = y_train
df_train.to_csv("training_set.csv", index=False)

# Save test set
df_test = pd.DataFrame(X_test)
df_test['y'] = y_test
df_test.to_csv("test_set.csv", index=False)

In [None]:
#Train the Random Forest classifier
clf = RandomForestClassifier(
    n_estimators=60,      # number of trees in my forest
    #random_state=None,     # different forest each run
)

clf.fit(X_train, y_train)


In [None]:
# Check class probabilities for the test set.
# predict_proba returns an array of shape (N_test, 2),
# where N_test is the number of test objects.
# Column 0 → probability of class -1 (negative, ZTF)
# Column 1 → probability of class +1 (positive, non-ZTF)
probs = clf.predict_proba(X_test)
print('probs',probs)

In [None]:
# CONFUSION MATRIX + TP / TN / FP / FN IDENTIFICATION
# Class labels for clarity
class_names = ["ZTF (non-CV)", "non-ZTF (CV)"]

# Plot confusion matrix (raw counts)
disp = ConfusionMatrixDisplay.from_estimator(
    clf,
    X_test,
    y_test,
    display_labels=class_names,
    cmap=plt.cm.Blues #heatmap
)

disp.ax_.set_title("Confusion Matrix (Raw Counts)")
print("Confusion matrix (raw counts):")
print(disp.confusion_matrix)

plt.show()


# Extract TN, FP, FN, TP explicitly
# Label convention used in YOUR code:
#   -1 → ZTF (negative, non-CV)
#   +1 → non-ZTF (positive, CV)

cm = confusion_matrix(y_test, clf.predict(X_test), labels=[-1, 1])

TN, FP, FN, TP = cm.ravel()




In [None]:
#this is for checking the class_weight part
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(
    X,
    y,
    test_size=0.20,
)

final_model_1 = RandomForestClassifier(
    n_estimators=60,
    #class_weight="balanced",
   
)

final_model_1.fit(X_train_1, y_train_1)

test_accuracy = final_model_1.score(X_test_1, y_test_1)
print("\nFinal test accuracy:", test_accuracy)


In [None]:
# CONFUSION MATRIX + TP / TN / FP / FN IDENTIFICATION
# Class labels for clarity
class_names = ["ZTF (non-CV)", "non-ZTF (CV)"]

# Plot confusion matrix (raw counts)
disp = ConfusionMatrixDisplay.from_estimator(
    final_model_1,
    X_test_1,
    y_test_1,
    display_labels=class_names,
    cmap=plt.cm.Blues #heatmap
)

disp.ax_.set_title("Confusion Matrix (Raw Counts)")
print("Confusion matrix (raw counts):")
print(disp.confusion_matrix)

plt.show()


# Extract TN, FP, FN, TP explicitly
# Label convention used in YOUR code:
#   -1 → ZTF (negative, non-CV)
#   +1 → non-ZTF (positive, CV)

cm = confusion_matrix(y_test_1, clf.predict(X_test_1), labels=[-1, 1])

TN, FP, FN, TP = cm.ravel()


