DATA ANALYSIS CODE
1) Initial data study

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Read the dataset; the bucket label is taken from the last column.
CSV_PATH = "sff_optimal_1.csv"
df = pd.read_csv(CSV_PATH)

bucket_col = df.columns[-1]
buckets = df[bucket_col].astype(int)
total_n = len(buckets)
last_bucket = int(buckets.iloc[-1])  # bucket of the final row; not used by the plot below

# Occurrences of buckets 0..3; a bucket absent from the data is reported as 0.
bucket_ids = [0, 1, 2, 3]
counts = buckets.value_counts().reindex(bucket_ids, fill_value=0)

# Bar chart of the bucket frequencies
plt.figure()
bars = plt.bar(bucket_ids, counts.values)
plt.xticks(bucket_ids, [str(b) for b in bucket_ids])
plt.xlabel("Bucket")
plt.ylabel("Frequency")
plt.title(f"Bucket Frequency (N={total_n})")

# Write each count centred just above its bar.
for bar, bucket_count in zip(bars, counts.values):
    x_mid = bar.get_x() + bar.get_width() / 2.0
    plt.text(x_mid, bar.get_height(), str(int(bucket_count)),
             ha='center', va='bottom')

plt.show()


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Configuration: input/output files and the thresholds behind the "underused" flag.
CSV_PATH = "sff_optimal_1.csv"
EXPORT_CSV = "sff_usage_metrics_final.csv"
SFF_PREFIX = "sff_"          # columns whose name starts with this prefix are analysed
ZERO_TOL = 1e-12             # values at or below this tolerance count as zero
RARE_NONZERO_RATE = 0.05     # flag when nonzero in fewer than 5% of samples
RARE_MEAN = 0.01             # flag when mean usage is below 1%
df = pd.read_csv(CSV_PATH)

# Pick the SFF columns by prefix and stop early if the file has none.
sff_cols = [name for name in df.columns if name.startswith(SFF_PREFIX)]
if not sff_cols:
    raise ValueError(f"No columns starting with '{SFF_PREFIX}' were found.")

SFF = df[sff_cols].copy()

# Per-column usage statistics
n = len(SFF)
above_tol = SFF > ZERO_TOL
at_or_below_tol = SFF <= ZERO_TOL
nonzero_rate = above_tol.sum(axis=0) / n
zero_share = at_or_below_tol.sum(axis=0) / n
mean_usage = SFF.mean(axis=0)
median_usage = SFF.median(axis=0)
p95_usage = SFF.quantile(0.95, axis=0)
# Coefficient of variation; a zero mean is swapped for NaN so the ratio is NaN.
safe_mean = mean_usage.replace(0, np.nan)
cv_usage = SFF.std(axis=0) / safe_mean

# One row per SFF, least-used first (by nonzero rate, ties broken by mean).
metrics = pd.DataFrame({
    "sff": sff_cols,
    "nonzero_rate": nonzero_rate.values,
    "mean": mean_usage.values,
    "median": median_usage.values,
    "p95": p95_usage.values,
    "cv": cv_usage.values,
    "zero_share": zero_share.values,
})
metrics = metrics.sort_values(by=["nonzero_rate", "mean"], ascending=[True, True])
metrics = metrics.reset_index(drop=True)

# An SFF is flagged when EITHER threshold is undercut.
rarely_nonzero = metrics["nonzero_rate"] < RARE_NONZERO_RATE
low_mean = metrics["mean"] < RARE_MEAN
metrics["rare_flag"] = rarely_nonzero | low_mean

print("=== Underused SFFs (by thresholds) ===")
flagged = metrics.loc[metrics["rare_flag"], ["sff", "nonzero_rate", "mean"]]
print(flagged)

# Write the full metrics table to disk
metrics.to_csv(EXPORT_CSV, index=False)
print(f"\nSaved metrics table to: {EXPORT_CSV}")

# Horizontal bar chart of the nonzero rate, lowest first; flagged SFFs get a star.
metrics_nr = metrics.sort_values("nonzero_rate", ascending=True)
nr_values = metrics_nr["nonzero_rate"].values
plt.figure()
ypos = np.arange(len(metrics_nr))
bars = plt.barh(ypos, nr_values)
plt.yticks(ypos, metrics_nr["sff"].values)
plt.xlabel("Nonzero rate")
plt.title("SFF Nonzero Rate (lower = used less often)")
for row in np.flatnonzero(metrics_nr["rare_flag"].values):
    plt.text(nr_values[row], row, "  ⭑", va='center')  # star mark
plt.tight_layout()
plt.show()


# Same chart for the mean usage.
metrics_mean = metrics.sort_values("mean", ascending=True)
mean_values = metrics_mean["mean"].values
plt.figure()
ypos = np.arange(len(metrics_mean))
bars = plt.barh(ypos, mean_values)
plt.yticks(ypos, metrics_mean["sff"].values)
plt.xlabel("Mean usage (probability)")
plt.title("SFF Mean Usage (lower = used less)")
for row in np.flatnonzero(metrics_mean["rare_flag"].values):
    plt.text(mean_values[row], row, "  ⭑", va='center')
plt.tight_layout()
plt.show()


# Box plots (outliers hidden) for the K SFFs with the lowest nonzero rate.
K = 10
bottom_cols = metrics_nr["sff"].head(K).tolist()
bottom_samples = [SFF[name].values for name in bottom_cols]
plt.figure()
plt.boxplot(bottom_samples, vert=False, labels=bottom_cols, showfliers=False)
plt.xlabel("Value")
plt.title(f"SFF Distributions — Bottom {K} by Nonzero Rate")
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Loading the file
CSV_PATH = "sff_optimal_1.csv"
FEATURE_COLS = slice(0, 25)   # first 25 columns; presumably 0/1 material indicators — confirm
GRID_SHAPE = (5, 5)           # the 25 features are laid out on a 5x5 grid
GRID_ORDER = "row-major"      # any other value is treated as column-major
df = pd.read_csv(CSV_PATH)

# Metrics
X = df.iloc[:, FEATURE_COLS].astype(int).to_numpy()   # shape: (n_samples, 25)
n_samples = X.shape[0]

# counts and rates per position
counts = X.sum(axis=0)                # length 25
rates  = counts / n_samples

# reshape to grid
order = 'C' if GRID_ORDER == "row-major" else 'F'
grid_counts = counts.reshape(GRID_SHAPE, order=order)
grid_rates  = rates.reshape(GRID_SHAPE,  order=order)

print(f"Total samples: {n_samples}")
print("Counts per feature (0..24):")
print(counts)

# Plotting heatmap (occupancy rate); imshow draws grid row 0 at the top.
plt.figure()
im = plt.imshow(grid_rates, vmin=0, vmax=1)
plt.title("Position occupancy rate (P(material=1))")
plt.xlabel("x")
plt.ylabel("y")
plt.colorbar(im, label="rate")
h, w = GRID_SHAPE
# annotate every cell with its count and its rate
for i in range(h):
    for j in range(w):
        plt.text(j, i, f"{int(grid_counts[i,j])}\n({grid_rates[i,j]:.2f})",
                 ha='center', va='center')
plt.tight_layout()
plt.show()

# Plotting contour
# build coordinate grid for contouring
yy, xx = np.mgrid[0:h, 0:w]
plt.figure()
rate_lo, rate_hi = grid_rates.min(), grid_rates.max()
if rate_hi > rate_lo:
    levels = np.linspace(rate_lo, rate_hi, 7)
else:
    # Every cell has the same rate (or the rates are NaN): contourf requires
    # strictly increasing levels, so fall back to the full probability range.
    levels = np.linspace(0.0, 1.0, 7)
cs = plt.contourf(xx, yy, grid_rates, levels=levels)
# contourf puts y=0 at the bottom by default; flip the axis so grid row 0 is
# at the top, the same orientation as the heatmap above.
plt.gca().invert_yaxis()
plt.title("5×5 contour of occupancy rate")
plt.xlabel("x")
plt.ylabel("y")
plt.colorbar(cs, label="rate")
plt.tight_layout()
plt.show()


In [None]:
# Settings for the per-class occupancy heatmaps, then load the dataset.
CSV_PATH = "sff_optimal_1.csv"
FEATURE_COLS = slice(0, 25)
GRID_SHAPE = (5, 5)
GRID_ORDER = "row-major"
BUCKET_IS_LAST_COL = True
BUCKET_COL = None
df = pd.read_csv(CSV_PATH)

# When no bucket column is named, fall back to the last column (if allowed).
if BUCKET_IS_LAST_COL and BUCKET_COL is None:
    BUCKET_COL = df.columns[-1]

# Feature matrix
X = df.iloc[:, FEATURE_COLS].astype(int).to_numpy()
order = 'F' if GRID_ORDER != "row-major" else 'C'

# One heatmap per class; classes without samples are reported and skipped.
classes = [0, 1, 2, 3]
h, w = GRID_SHAPE
for c in classes:
    subset = df[df[BUCKET_COL] == c]
    if len(subset) == 0:
        print(f"Class {c}: no samples, skipping.")
        continue

    class_features = subset.iloc[:, FEATURE_COLS]

    # per-position mean and sum within this class, arranged on the grid
    rates = class_features.mean().to_numpy()
    counts = class_features.sum().to_numpy()
    grid = rates.reshape(GRID_SHAPE, order=order)
    grid_counts = counts.reshape(GRID_SHAPE, order=order)

    plt.figure()
    im = plt.imshow(grid, vmin=0, vmax=1)
    plt.title(f"Occupancy heatmap | class = {c} (N={len(subset)})")
    plt.xlabel("x")
    plt.ylabel("y")
    plt.colorbar(im, label="P(material=1)")

    # label each cell with its count and, in brackets, its rate
    for i in range(h):
        for j in range(w):
            cell_label = f"{int(grid_counts[i,j])}\n({grid[i,j]:.2f})"
            plt.text(j, i, cell_label, ha='center', va='center')

    plt.tight_layout()
    plt.show()


2) Feature engineering: checking which features are most influential on the target

In [None]:
from sklearn.feature_selection import mutual_info_classif
from scipy.stats import chi2_contingency
# Load the dataset and split it by position: 25 bit columns, the next 26
# columns as SFFs, and the last column as the bucket label.
df = pd.read_csv("sff_optimal_1.csv")
n = len(df)
y = df.iloc[:, -1].astype(int)  # bucket 0..3
bits = df.iloc[:, :25].astype(int)
sff = df.iloc[:, 25:51].astype(float)

# Mutual information with the bucket: bits are scored as discrete features,
# SFFs as continuous ones. Each result is sorted from highest to lowest.
X_bits, X_sff = bits.values, sff.values
mi_bits = mutual_info_classif(X_bits, y, discrete_features=True, random_state=0)
mi_sff = mutual_info_classif(X_sff, y, discrete_features=False, random_state=0)
mi_bits_s = pd.Series(mi_bits, index=bits.columns).sort_values(ascending=False)
mi_sff_s = pd.Series(mi_sff, index=sff.columns).sort_values(ascending=False)

# χ² + Cramér’s V for bits: the classes that actually occur in y, ascending
classes_present = sorted(y.unique())

def cramers_v_safe(table_values):
    """
    Cramér's V and chi-square p-value for a contingency table.

    table_values: array-like (r x c) of nonnegative counts.
    Returns (V, p_value), or (np.nan, np.nan) when the statistic is undefined:
    an empty table, an all-zero row or column, or fewer than two rows or
    columns (zero degrees of freedom).
    """
    table = np.asarray(table_values)

    n_obs = table.sum()
    if n_obs == 0:
        return np.nan, np.nan

    # an empty row or column makes the expected frequencies zero
    if (table.sum(axis=0) == 0).any() or (table.sum(axis=1) == 0).any():
        return np.nan, np.nan

    # degrees of freedom must be > 0
    r, c = table.shape
    if r < 2 or c < 2:
        return np.nan, np.nan

    # No Yates continuity correction: V is defined on the plain chi-square.
    chi2, p, _dof, _expected = chi2_contingency(table, correction=False)

    # n_obs > 0 and min(r, c) >= 2 hold here, so the normaliser is positive.
    V = np.sqrt(chi2 / (n_obs * min(r - 1, c - 1)))
    return V, p

def _bit_row(col):
    """Association of one bit column with the bucket, as a record for the summary table."""
    # Contingency table: rows are the bit values 0/1, columns the classes that
    # occur in y; combinations that never occur are filled with 0.
    tab = pd.crosstab(bits[col], y).reindex(
        index=[0, 1], columns=classes_present, fill_value=0
    )
    V, p = cramers_v_safe(tab.values)
    return {
        "feature": col,
        "cramers_v": V,
        "p_value": p,
        "count_ones": int(bits[col].sum()),
        "count_zeros": int((bits[col] == 0).sum()),
    }


rows = [_bit_row(col) for col in bits.columns]

# strongest association first
cv_bits = pd.DataFrame(rows).sort_values(by="cramers_v", ascending=False)
cv_bits = cv_bits.reset_index(drop=True)
print(cv_bits.head(10))

# Conditional class probabilities given the value of one bit
def cond_probs_bit(col):
    """Return (p1, p0): the share of buckets 0..3 among rows where bit `col` is 1 and where it is 0."""
    def _class_shares(bit_value):
        tallies = (
            df[bits[col] == bit_value][y.name]
            .value_counts()
            .reindex([0, 1, 2, 3], fill_value=0)
        )
        # max(1, ...) keeps the division defined when no row has this bit value
        return (tallies / max(1, tallies.sum())).values

    return _class_shares(1), _class_shares(0)


# Shift in class probability between bit=1 and bit=0 for the ten bits with the
# highest mutual information; one row per bit, one column per class.
top_bits = mi_bits_s.head(10).index
diff_mat = np.vstack(
    [p_on - p_off for p_on, p_off in map(cond_probs_bit, top_bits)]
)  # 10x4

plt.figure()
plt.imshow(diff_mat, vmin=-1, vmax=1)
plt.yticks(range(len(top_bits)), top_bits)
plt.xticks(range(4), [0,1,2,3])
plt.title("ΔP(class | bit=1) vs bit=0 (top-10 MI bits)")
plt.colorbar(label="Probability shift")
plt.tight_layout()
plt.show()

# Mean of every SFF within each class
by_class = sff.assign(cls=y).groupby("cls").mean()
sff_means = by_class.T  # rows=SFFs, cols=classes
# The ten SFFs whose class means spread the most (std across classes)
std_across = sff_means.std(axis=1).sort_values(ascending=False)
pick = std_across.head(10).index
plt.figure()
plt.imshow(sff_means.loc[pick].values, aspect='auto')
plt.yticks(range(len(pick)), pick)
plt.xticks(range(4), [0,1,2,3])
plt.title("E[SFF | class] for top-10 discriminative SFFs")
plt.colorbar(label="mean value")
plt.tight_layout()
plt.show()

# Volume fraction: number of set bits per row divided by 25
vf = bits.sum(axis=1) / 25.0
plt.figure()
plt.hist(vf, bins=20)
plt.title("Volume fraction (overall)")
plt.show()

plt.figure()
for bucket in (0, 1, 2, 3):
    plt.hist(vf[y == bucket], bins=20, alpha=0.6)
plt.title("Volume fraction by class")
plt.legend([0,1,2,3])
plt.show()


3) Correlation matrix of the features

In [None]:
from scipy.spatial.distance import squareform
from scipy.cluster.hierarchy import linkage, leaves_list

# Settings for the correlation study, then load the data.
# NOTE: this rebinds SFF to a slice; an earlier cell used the same name for a DataFrame.
CSV_PATH = "sff_optimal_1.csv"
BITS = slice(0, 25)          # columns 0..24
SFF = slice(25, 51)          # columns 25..50
FIGSIZE = (12, 10)
ANNOTATE = False             # whether to print the values into the cells
ANNOTATE_THRESH = 0.5        # annotated cells with |value| below this are left blank
df = pd.read_csv(CSV_PATH)

# Both blocks as float, plus the two placed side by side.
bits, sff = (df.iloc[:, cols].astype(float) for cols in (BITS, SFF))
allX = pd.concat([bits, sff], axis=1)

# Pearson for the bits and for the combined matrix, Spearman for the SFFs.
corr_bits = bits.corr(method="pearson")
corr_sff = sff.corr(method="spearman")
corr_all = allX.corr(method="pearson")

# Clustering the higher correlations together for better visualisation
def repair_corr(R: pd.DataFrame) -> pd.DataFrame:
    """Set diag=1 and replace off-diagonal NaNs with 0 so plots/clustering don’t break.

    Returns a new float DataFrame with the same index and columns; `R` itself
    is left untouched.
    """
    # Work on an explicit, writable copy. Writing through `R.values` is not
    # reliable: pandas may hand back a read-only array (Copy-on-Write) or a
    # detached copy, in which case the diagonal fix would fail or be lost.
    fixed = R.to_numpy(dtype=float, copy=True)
    np.fill_diagonal(fixed, 1.0)
    fixed[np.isnan(fixed)] = 0.0  # only NaN is replaced; infinities are kept
    return pd.DataFrame(fixed, index=R.index, columns=R.columns)

def cluster_order(R: pd.DataFrame):
    """Average-linkage on distance = 1 - |r|. Returns index order.

    The result is an array of positional indices that places strongly
    correlated features next to each other. A matrix with fewer than two
    features has nothing to cluster and is returned in its existing order.

    Raises:
        ValueError: if the distance matrix contains non-finite values.
    """
    R = repair_corr(R)
    n_features = R.shape[0]
    if n_features < 2:
        # linkage() rejects an empty condensed distance vector
        return np.arange(n_features)

    # Rounding can leave |r| marginally above 1; clip so no distance is negative.
    D = np.clip(1.0 - np.abs(R.values), 0.0, None)
    np.fill_diagonal(D, 0.0)
    # condensed distance vector
    dvec = squareform(D, checks=False)
    if not np.isfinite(dvec).all():
        raise ValueError("Non-finite values in distance; check for NaNs/inf in correlation.")
    Z = linkage(dvec, method="average")
    return leaves_list(Z)

# Heatmap of a correlation matrix
def plot_corr(R: pd.DataFrame, title: str, outfile: str = None,
              annotate: bool = False, thresh: float | None = None,
              figsize=(10, 8)):
    """Draw `R` as a heatmap on a fixed [-1, 1] colour scale and show it.

    The matrix is passed through repair_corr first. With `annotate` set, each
    cell's value is written into the plot; when `thresh` is also given, cells
    whose absolute value is below it are left blank. If `outfile` is given,
    the figure is saved there (300 dpi) before it is shown.
    """
    matrix = repair_corr(R)
    n_rows, n_cols = matrix.shape

    plt.figure(figsize=figsize)
    image = plt.imshow(matrix.values, vmin=-1, vmax=1)
    plt.title(title)
    plt.colorbar(image, label="correlation")
    plt.xticks(range(len(matrix.columns)), matrix.columns, rotation=90)
    plt.yticks(range(len(matrix.index)), matrix.index)

    if annotate:
        for row in range(n_rows):
            for col in range(n_cols):
                val = matrix.iat[row, col]
                if thresh is None or not (abs(val) < thresh):
                    plt.text(col, row, f"{val:.2f}", ha="center", va="center", fontsize=6)

    plt.tight_layout()
    if outfile:
        plt.savefig(outfile, dpi=300)
    plt.show()

def plot_corr_clustered(R: pd.DataFrame, title: str, outfile: str = None,
                        annotate: bool = False, thresh: float | None = None,
                        figsize=(10, 8)):
    """Reorder rows and columns of `R` by cluster_order, then plot with " (clustered)" appended to the title."""
    leaf_order = cluster_order(R)
    reordered = R.iloc[leaf_order, leaf_order]
    plot_corr(
        reordered,
        f"{title} (clustered)",
        outfile=outfile,
        annotate=annotate,
        thresh=thresh,
        figsize=figsize,
    )

# Plain heatmaps: bits only, SFFs only, and all features together.
# NOTE(review): the combined matrix is always annotated in every cell,
# independent of ANNOTATE / ANNOTATE_THRESH — confirm this is intended.
for matrix, title, png_name, annotate_cells, min_abs in (
    (corr_bits, "Bits-only correlation (phi)", "corr_bits_25x25.png", ANNOTATE, ANNOTATE_THRESH),
    (corr_sff, "SFF-only correlation (Spearman)", "corr_sff_26x26.png", ANNOTATE, ANNOTATE_THRESH),
    (corr_all, "All features correlation (Pearson)", "corr_all_51x51.png", True, None),
):
    plot_corr(matrix, title, png_name,
              annotate=annotate_cells, thresh=min_abs, figsize=FIGSIZE)

# Clustered versions, never annotated
for matrix, title, png_name in (
    (corr_bits, "Bits correlation", "corr_bits_clustered.png"),
    (corr_sff, "SFF correlation", "corr_sff_clustered.png"),
    (corr_all, "All features correlation", "corr_all_clustered.png"),
):
    plot_corr_clustered(matrix, title, png_name, annotate=False, figsize=FIGSIZE)

# Repaired matrices as CSV, e.g. for inspection in Excel
for matrix, csv_name in (
    (corr_bits, "corr_bits_25x25.csv"),
    (corr_sff, "corr_sff_26x26.csv"),
    (corr_all, "corr_all_51x51.csv"),
):
    repair_corr(matrix).to_csv(csv_name)
print("Saved PNGs and CSVs.")


4) Removal of perfectly correlated features

In [None]:
import pandas as pd

# Read the dataset from disk again; the column drops below act on this frame.
df = pd.read_csv(filepath_or_buffer="sff_optimal_1.csv")

def drop_and_report(df, colname):
    """Return `df` without column `colname`, printing whether it was removed or not found.

    When the column is missing, the frame is returned as is.
    """
    if colname not in df.columns:
        print(f"Column not found: {colname}")
        return df

    trimmed = df.drop(columns=[colname])
    print(f"Removed column: {colname}")
    return trimmed

# Drop the features listed as completely correlated, one at a time
related_columns = [
    "sff_C_2x3_up_open",
    "sff_C_3x2_right_open",
    "sff_L_2x2_down_right",
    "sff_L_2x2_down_left",
    "sff_L_2x2_up_left",
    "sff_T_3x3_left",
    "sff_T_3x3_up",
]
for redundant_name in related_columns:
    df = drop_and_report(df, redundant_name)

print(df.shape)


In [None]:
# Drop the features listed as rare, one at a time
rare_columns = [
    "sff_cross_3x3",
    "sff_diag_main_3x3",
    "sff_diag_anti_3x3",
    "sff_checker_2x2",
    "sff_plus_3x3",
    "sff_T_3x3_right",
    "sff_T_3x3_down",
]
for rare_name in rare_columns:
    df = drop_and_report(df, rare_name)

print(df.shape)

5) Sanity check and feature engineering for the edited dataset (also useful for presentation)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_selection import mutual_info_classif

# Column layout of the edited frame: 25 geometry bits first, the remaining SFF
# columns next, and the bucket label in the last position.
# A fixed end of 51 is only right while all 26 SFF columns are present. After
# SFF columns have been dropped from df it would reach the bucket column and
# feed the target in as a feature, so the SFF block ends just before the last
# column instead.
# NOTE(review): assumes nothing else sits between the SFF block and the bucket
# column — confirm against the CSV header.
BITS_SLICE = slice(0, 25)      # 25 binary columns
SFF_SLICE  = slice(25, -1)     # SFF columns still present, bucket column excluded
TARGET_COL = df.columns[-1]    # last column is bucket (0..3)

# Preparing blocks
bits = df.iloc[:, BITS_SLICE].astype(int)
sff  = df.iloc[:, SFF_SLICE].astype(float)
y    = df[TARGET_COL].astype(int).values

# safety against constant columns (MI will be 0 for those)
bits_var0 = bits.columns[(bits.var() == 0).values].tolist()
sff_var0  = sff.columns[(sff.var() == 0).values].tolist()
if bits_var0 or sff_var0:
    print("Warning: constant columns (MI will be 0):")
    if bits_var0: print("  Bits:", bits_var0)
    if sff_var0:  print("  SFF :", sff_var0)

# Mutual Information: bits are scored as discrete features, SFFs as continuous
mi_bits = mutual_info_classif(bits.values, y, discrete_features=True,  random_state=0)
mi_sff  = mutual_info_classif(sff.values,  y, discrete_features=False, random_state=0)

mi_bits_s = pd.Series(mi_bits, index=bits.columns, name="mi")
mi_sff_s  = pd.Series(mi_sff,  index=sff.columns,  name="mi")

# One table for both blocks, highest MI first, tagged with the source block
mi_all = (
    pd.concat([mi_bits_s, mi_sff_s])
      .to_frame()
      .assign(block=lambda d: np.where(d.index.isin(bits.columns), "bits", "sff"))
      .sort_values("mi", ascending=False)
      .reset_index(names="feature")
)

# Top-10 and Bottom-10
top10    = mi_all.head(10).copy()
bottom10 = mi_all.tail(10).sort_values("mi", ascending=True).copy()

print("\nTop-10 most discriminative (by mutual information):")
print(top10.to_string(index=False))

print("\nBottom-10 least discriminative (by mutual information):")
print(bottom10.to_string(index=False))

# bar plots (y-axis inverted so the first table row is drawn at the top)
plt.figure(figsize=(8,4))
plt.barh(top10["feature"], top10["mi"])
plt.gca().invert_yaxis()
plt.title("Top-10 MI features")
plt.xlabel("Mutual Information")
plt.tight_layout()
plt.show()

plt.figure(figsize=(8,4))
plt.barh(bottom10["feature"], bottom10["mi"])
plt.gca().invert_yaxis()
plt.title("Bottom-10 MI features")
plt.xlabel("Mutual Information")
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd, numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_selection import mutual_info_classif
from scipy.stats import chi2_contingency


# Dataset: this cell does NOT read a file. It analyses the in-memory `df`
# left by the earlier cells, i.e. the frame after the column removals.

# NOTE(review): CSV_PATH is assigned but never read in this cell; kept only
# so the name stays defined — confirm whether this file was meant to be loaded.
CSV_PATH = "sff_with_pca_optimal.csv"

# The SFF block runs from column 25 up to, but not including, the last column.
# A fixed width of 26 would reach the bucket column once SFF columns have been
# dropped from df and score the target as if it were a feature.
# NOTE(review): assumes nothing else sits between the SFF block and the bucket
# column — confirm against the CSV header.
bits = df.iloc[:, 0:25].astype(int)
sff  = df.iloc[:, 25:-1].astype(float)
y    = df.iloc[:, -1].astype(int)
n    = len(df)


# Mutual Information (MI): bits scored as discrete features, SFFs as continuous

mi_bits = mutual_info_classif(bits.values, y, discrete_features=True,  random_state=0)
mi_sff  = mutual_info_classif(sff.values,  y, discrete_features=False, random_state=0)

mi_bits_s = pd.Series(mi_bits, index=bits.columns, name="mi")
mi_sff_s  = pd.Series(mi_sff,  index=sff.columns,  name="mi")

# Top/bottom by MI
mi_all = (
    pd.concat([mi_bits_s, mi_sff_s])
      .to_frame()
      .assign(block=lambda d: np.where(d.index.isin(bits.columns), "bits", "sff"))
      .sort_values("mi", ascending=False)
      .reset_index(names="feature")
)
print("\nTop-10 (ALL) by MI")
print(mi_all.head(10).to_string(index=False))
print("\nBottom-10 (ALL) by MI")
print(mi_all.tail(10).sort_values("mi").to_string(index=False))


# χ² + Cramér’s V (for bits only): the classes that actually occur in y, ascending
classes_present = sorted(pd.unique(y))

def cramers_v_safe(table_values: np.ndarray):
    """Return (Cramér's V, chi-square p-value) for a table of counts.

    (nan, nan) is returned when the statistic is undefined: the table is
    empty, a row or column sums to zero, or there are fewer than two rows or
    columns.
    """
    undefined = (np.nan, np.nan)

    n_obs = table_values.sum()
    if n_obs == 0:
        return undefined

    col_totals = table_values.sum(axis=0)
    row_totals = table_values.sum(axis=1)
    if (col_totals == 0).any() or (row_totals == 0).any():
        return undefined

    n_rows, n_cols = table_values.shape
    if min(n_rows, n_cols) < 2:
        return undefined

    # chi-square without the continuity correction
    chi2, p, _dof, _expected = chi2_contingency(table_values, correction=False)
    scale = n_obs * min(n_rows - 1, n_cols - 1)
    if scale <= 0:
        return np.nan, p
    return np.sqrt(chi2 / scale), p

# One record per bit: Cramér's V and p-value against the bucket, plus the
# number of ones and zeros in the column.
rows = []
for bit_name in bits.columns:
    bit_values = bits[bit_name]
    # rows: bit value 0/1, columns: classes present in y; missing cells become 0
    table = pd.crosstab(bit_values, y).reindex(
        index=[0, 1], columns=classes_present, fill_value=0
    )
    V, p = cramers_v_safe(table.values)
    rows.append(dict(
        feature=bit_name,
        cramers_v=V,
        p_value=p,
        count_ones=int(bit_values.sum()),
        count_zeros=int((bit_values == 0).sum()),
    ))

cv_bits = pd.DataFrame(rows)
# undefined (NaN) values are pushed to the end in both rankings
strongest_first = cv_bits.sort_values("cramers_v", ascending=False, na_position="last")
weakest_first = cv_bits.sort_values("cramers_v", ascending=True, na_position="last")
cv_top10 = strongest_first.head(10)
cv_bottom10 = weakest_first.head(10)

print("\nTop-10 bits by Cramér’s V")
print(cv_top10.to_string(index=False))
print("\nBottom-10 bits by Cramér’s V")
print(cv_bottom10.to_string(index=False))

# P(class | bit=1) and P(class | bit=0); their difference is plotted
# for the top-10 and bottom-10 MI bits
def cond_probs_bit(col):
    """Return (p1, p0): bucket shares (0..3) over rows with bit `col` equal to 1, and equal to 0."""
    shares = {}
    for bit_value in (1, 0):
        selected = df[bits[col] == bit_value][y.name]
        tally = selected.value_counts().reindex([0, 1, 2, 3], fill_value=0)
        # dividing by at least 1 keeps an empty selection at all-zero shares
        shares[bit_value] = (tally / max(1, tally.sum())).values
    return shares[1], shares[0]

# The ten bits with the highest and the ten with the lowest mutual information
mi_bits_desc = mi_bits_s.sort_values(ascending=False)
mi_bits_asc = mi_bits_s.sort_values(ascending=True)
top_bits_mi = mi_bits_desc.head(10).index
bottom_bits_mi = mi_bits_asc.head(10).index

def plot_deltaP(bit_list, title):
    """Heatmap of P(class | bit=1) - P(class | bit=0): one row per bit in `bit_list`, one column per class."""
    shifts = np.vstack(
        [p_on - p_off for p_on, p_off in map(cond_probs_bit, bit_list)]
    )
    plt.figure(figsize=(6,6))
    plt.imshow(shifts, vmin=-1, vmax=1)
    plt.yticks(range(len(bit_list)), bit_list)
    plt.xticks(range(4), [0,1,2,3])
    plt.title(title)
    plt.colorbar(label="Probability shift")
    plt.tight_layout()
    plt.show()

for bit_group, heading in (
    (top_bits_mi, "ΔP(class | bit=1) vs bit=0 — TOP-10 MI bits"),
    (bottom_bits_mi, "ΔP(class | bit=1) vs bit=0 — BOTTOM-10 MI bits"),
):
    plot_deltaP(bit_group, heading)


# Mean of every SFF within each class, and how much those means spread
# across classes (std); the ten largest and ten smallest spreads are plotted.

class_means = sff.assign(cls=y).groupby("cls").mean()
sff_means = class_means.T  # rows=SFFs, cols=classes
std_across = sff_means.std(axis=1)

spread_desc = std_across.sort_values(ascending=False)
spread_asc = std_across.sort_values(ascending=True)
top10_sff = spread_desc.head(10).index
bottom10_sff = spread_asc.head(10).index

def plot_sff_means(rows, title):
    """Heatmap of the class-conditional means for the SFFs named in `rows`."""
    selected_means = sff_means.loc[rows].values
    plt.figure(figsize=(7,5))
    plt.imshow(selected_means, aspect='auto')
    plt.yticks(range(len(rows)), rows)
    plt.xticks(range(4), [0,1,2,3])
    plt.title(title)
    plt.colorbar(label="E[SFF | class]")
    plt.tight_layout()
    plt.show()

for sff_group, heading in (
    (top10_sff, "E[SFF | class] — TOP-10 discriminative SFFs (by std)"),
    (bottom10_sff, "E[SFF | class] — BOTTOM-10 least discriminative SFFs (by std)"),
):
    plot_sff_means(sff_group, heading)


# Volume fraction: number of set bits per row divided by 25
vf = bits.sum(axis=1) / 25.0

# Distribution over all samples
plt.figure()
plt.hist(vf, bins=20)
plt.title("Volume fraction (overall)")
plt.xlabel("VF")
plt.ylabel("count")
plt.tight_layout()
plt.show()

# Overlaid distributions, one histogram per class
plt.figure()
for bucket in (0, 1, 2, 3):
    in_bucket = vf[y == bucket]
    plt.hist(in_bucket, bins=20, alpha=0.6, label=str(bucket))
plt.title("Volume fraction by class")
plt.xlabel("VF")
plt.ylabel("count")
plt.legend(title="class")
plt.tight_layout()
plt.show()


6) Dropping further features based on their importance

In [None]:
# Drop the columns listed in MI0_columns, one at a time
MI0_columns = [
    "sff_C_2x3_down_open",
    "sff_C_3x2_left_open",
]
for uninformative_name in MI0_columns:
    df = drop_and_report(df, uninformative_name)

print(df.shape)


In [None]:

# Write the reduced frame to disk without the index column
out_path = "feature_engineered_data.csv"
df.to_csv(path_or_buf=out_path, index=False)
print(f"Saved to {out_path}")
