In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from tabulate import tabulate  # Assuming you use tabulate for pretty printing

In [2]:
# Load data and perform initial preprocessing
df = pd.read_csv('house_prices_train.csv')
selected_columns = ['MSSubClass', 'LotArea', 'OverallQual', 'OverallCond', 
                    'YearBuilt', 'YearRemodAdd', 'GarageCars', 'SalePrice']
df = df[selected_columns]

In [3]:
model = IsolationForest(n_estimators=100, max_samples=0.5,
                        contamination='auto', max_features=1.0, bootstrap=False, n_jobs=None,
                        verbose=1, random_state=2020)
model.fit(df)

In [4]:
df

Unnamed: 0,MSSubClass,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,GarageCars,SalePrice
0,60,8450,7,5,2003,2003,2,208500
1,20,9600,6,8,1976,1976,2,181500
2,60,11250,7,5,2001,2002,2,223500
3,70,9550,7,5,1915,1970,3,140000
4,60,14260,8,5,2000,2000,3,250000
...,...,...,...,...,...,...,...,...
1455,60,7917,6,5,1999,2000,2,175000
1456,20,13175,6,6,1978,1988,2,210000
1457,70,9042,7,9,1941,2006,1,266500
1458,20,9717,5,6,1950,1996,1,142125


In [5]:
predictions = model.predict(df)
anomaly_indices = np.where(predictions == -1)[0]

### Counter Examples

In [79]:
selected_anomaly_index = 2
selected_anomaly = df.iloc[selected_anomaly_index]
num_counterexamples = 5
counterexample_indices = []

In [80]:

df_values = df.values.astype(float) 
selected_anomaly_values = selected_anomaly.values.astype(float).reshape(1, -1)

euclidean_distances = np.linalg.norm(df_values - selected_anomaly_values, axis=1)
sorted_indices = np.argsort(euclidean_distances)

print("Top 3 rows sorted by Euclidean Distance:")
for i in range(1,4):
    idx = sorted_indices[i]
    print(f"Index: {idx}, Euclidean Distance: {euclidean_distances[idx]}")

for idx in sorted_indices:
    if len(counterexample_indices) >= num_counterexamples:
        break
    if predictions[idx] == 1 and idx != selected_anomaly_index:  
        counterexample_indices.append(idx)


# Display the anomaly and its counterexamples
print("\nAnomaly:")
print(tabulate(df.iloc[selected_anomaly_index].to_frame().transpose(), headers='keys', tablefmt='grid', floatfmt=".8f"))
print("\n\n")

print("Counterexamples (Similar to Anomaly but Not Anomalies):")
for idx in counterexample_indices:
    print(tabulate(df.iloc[idx].to_frame().transpose(), headers='keys', tablefmt='grid', floatfmt=".8f"))
    print("\n")


Top 3 rows sorted by Euclidean Distance:
Index: 732, Euclidean Distance: 1011.7974105521322
Index: 1300, Euclidean Distance: 1577.3892988099037
Index: 815, Euclidean Distance: 1657.828097240483

Anomaly:
+----+--------------+----------------+---------------+---------------+---------------+----------------+--------------+-----------------+
|    |   MSSubClass |        LotArea |   OverallQual |   OverallCond |     YearBuilt |   YearRemodAdd |   GarageCars |       SalePrice |
|  2 |  60.00000000 | 11250.00000000 |    7.00000000 |    5.00000000 | 2001.00000000 |  2002.00000000 |   2.00000000 | 223500.00000000 |
+----+--------------+----------------+---------------+---------------+---------------+----------------+--------------+-----------------+



Counterexamples (Similar to Anomaly but Not Anomalies):
+-----+--------------+----------------+---------------+---------------+---------------+----------------+--------------+-----------------+
|     |   MSSubClass |        LotArea |   OverallQu

### Monte Carlo

In [90]:
OMEGA_1 = 0.5
OMEGA_2 = 1
OMEGA_3 = 1

def random_split(cols_names): 
    shuffled_columns = np.random.permutation(cols_names)
    split_point = np.random.randint(1, len(shuffled_columns))
    
    sim_cols = shuffled_columns[:split_point]
    diff_cols = shuffled_columns[split_point:]

    df_sim = subset_df[sim_cols]
    df_diff = subset_df[diff_cols]
    
    return df_sim.index.tolist(), df_diff.index.tolist()


# Explains how well D' explains a given sample (s) as an anomaly
def explanation_score(D_prime, s, F, F_diff, F_sim):
    # term_1 = (OMEGA_1 / len(D_prime)) * sum(sim(D_prime, r, F) for r in D_prime) # Measures the similarity between the subset D′ and a row r in D′
    term_2 = OMEGA_2 * (sim(D_prime, s, F_sim) * (len(F_sim)/ len(F))) # Measures the similarity restricted to the features in 𝐹_𝑠𝑖𝑚
    term_3 = OMEGA_3 * (sim(D_prime, s, F_diff) * (len(F_diff)/ len(F))) # Measures the similarity restricted to the features in 𝐹_𝑑𝑖𝑓𝑓
    
    # result = term_1 + term_2 - term_3
    result = term_2 + term_3
    return result

# Euclidean distances between D_prime and a sample (s) based on the specified features (features).
def sim(D_prime, s, features):
    # mean_vector = np.mean(D_prime, axis=0)
    # norm_value = np.linalg.norm(mean_vector - s)
    # s_E = 1 / (1 + norm_value)
    s_E = np.linalg.norm(D_prime[features] - s[features])
    return s_E

In [94]:
import numpy as np

best_F_diff, best_F_sim = None, None
best_score = np.inf
F = df.columns
rows_to_scores = {}
used_indices = set()

for i in range(1, int(len(df))):
    random_index = np.random.randint(0, len(df))  
    while random_index in used_indices:
        random_index = np.random.randint(0, len(df)) 

    used_indices.add(random_index)
    subset_df = df.iloc[random_index]
    F_diff, F_sim = random_split(F)
    
    score = explanation_score(subset_df, selected_anomaly, F, F_diff, F_sim)
    
    rows_to_scores[random_index] = score
    if score < best_score:
        best_score = score
        best_F_diff = F_diff
        best_F_sim = F_sim

        
# Sort the dictionary by its values
sorted_data = {k: v for k, v in sorted(rows_to_scores.items(), key=lambda item: item[1])}

row = 0
for key, value in sorted_data.items():
    if row < 10:
        print(f"{key:<10}{value:<20.10f}")
        row += 1
    else:
        break

2         0.0000000000        
815       646.6962776051      
732       885.3227342331      
1300      1247.0032499953     
374       1259.6610811464     
67        1303.9795371335     
1059      1358.9581504251     
147       1372.5001432665     
822       1415.0967231471     
776       1510.3090933062     


In [96]:
a = np.array([1, 2, 3, 4, 5, 6, 100])
b = np.array([1, 2, 3, 4, 5, 6])
c = np.array([100])

d = np.array([1, 1, 1, 1, 1, 1, 1])

all_vals = np.linalg.norm(a - d)
split_vals = np.linalg.norm(b - d[:len(b)]) + np.linalg.norm(c - d[len(b):])

print("all_vals:", all_vals)
print("split_vals:", split_vals)

all_vals: 99.27738916792686
split_vals: 106.41619848709567
