**Imports**

In [231]:
import pandas as pd
import numpy as np
import math
import heapq

from sklearn.ensemble import IsolationForest
from tabulate import tabulate

from functools import lru_cache


---
## Preprocessing

In [143]:
df = pd.read_csv('house_prices_train.csv')
selected_columns = ['MSSubClass', 'LotArea', 'OverallQual', 'OverallCond', 
                    'YearBuilt', 'YearRemodAdd', 'GarageCars', 'SalePrice']
df = df[selected_columns]

In [144]:
model = IsolationForest(n_estimators=100, max_samples=0.5,
                        contamination='auto', max_features=1.0, bootstrap=False, n_jobs=None,
                        verbose=1, random_state=2020)
model.fit(df)

In [145]:
df

Unnamed: 0,MSSubClass,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,GarageCars,SalePrice
0,60,8450,7,5,2003,2003,2,208500
1,20,9600,6,8,1976,1976,2,181500
2,60,11250,7,5,2001,2002,2,223500
3,70,9550,7,5,1915,1970,3,140000
4,60,14260,8,5,2000,2000,3,250000
...,...,...,...,...,...,...,...,...
1455,60,7917,6,5,1999,2000,2,175000
1456,20,13175,6,6,1978,1988,2,210000
1457,70,9042,7,9,1941,2006,1,266500
1458,20,9717,5,6,1950,1996,1,142125


In [146]:
predictions = model.predict(df)
anomaly_indices = np.where(predictions == -1)[0]

selected_anomaly_index = 2
selected_anomaly = df.iloc[selected_anomaly_index]

---
## Counter Examples  
*Run through all the rows in the data, calculate the **Euclidean distance** of each row from the given anomaly, and select examples with the minimal distances that are tagged as regular examples and not anomalies.*

In [147]:
num_counterexamples = 5
counterexample_indices = []

In [219]:
df_values = df.values.astype(float) 
selected_anomaly_values = selected_anomaly.values.astype(float).reshape(1, -1)

euclidean_distances = np.linalg.norm(df_values - selected_anomaly_values, axis=1)
sorted_indices = np.argsort(euclidean_distances)


for idx in sorted_indices:
    if len(counterexample_indices) >= num_counterexamples:
        break
    if predictions[idx] == 1 and idx != selected_anomaly_index:  
        counterexample_indices.append(idx)


# Display the anomaly and its counterexamples
print("\nAnomaly:")
print(tabulate(df.iloc[selected_anomaly_index].to_frame().transpose(), headers='keys', tablefmt='grid', floatfmt=".8f"))
print("\n\n")

print("Counterexamples (Similar to Anomaly but Not Anomalies):")
for idx in counterexample_indices:
    print(tabulate(df.iloc[idx].to_frame().transpose(), headers='keys', tablefmt='grid', floatfmt=".8f"))
    print("\n")



Anomaly:
+----+--------------+----------------+---------------+---------------+---------------+----------------+--------------+-----------------+
|    |   MSSubClass |        LotArea |   OverallQual |   OverallCond |     YearBuilt |   YearRemodAdd |   GarageCars |       SalePrice |
|  2 |  60.00000000 | 11250.00000000 |    7.00000000 |    5.00000000 | 2001.00000000 |  2002.00000000 |   2.00000000 | 223500.00000000 |
+----+--------------+----------------+---------------+---------------+---------------+----------------+--------------+-----------------+



Counterexamples (Similar to Anomaly but Not Anomalies):
+-----+--------------+----------------+---------------+---------------+---------------+----------------+--------------+-----------------+
|     |   MSSubClass |        LotArea |   OverallQual |   OverallCond |     YearBuilt |   YearRemodAdd |   GarageCars |       SalePrice |
| 732 |  60.00000000 | 11404.00000000 |    7.00000000 |    5.00000000 | 1998.00000000 |  1999.00000000 |   

---
## Monte Carlo
### Using Substrat ideas - Random Split and Explanation Score  
*Given the full dataset, **randomly split the columns into "diff" and "same"** groups.  
Then, select **one row at a time** and calculate how well the row explains the given anomaly using a **similarity method**.  
Identify the 5 rows with the highest explanation scores. These are the rows that will be chosen to explain the anomaly.*


In [152]:
OMEGA_1 , OMEGA_2 , OMEGA_3 = 1 , 0 , 0

# Split the data RANDOMLY into similar and different groups
def random_split(cols_names): 
    shuffled_columns = np.random.permutation(cols_names)
    split_point = np.random.randint(1, len(shuffled_columns))

    sim_cols = shuffled_columns[:split_point]
    diff_cols = shuffled_columns[split_point:]

    # Return indices of the split DataFrame subsets
    return subset_df[sim_cols].index.tolist(), subset_df[diff_cols].index.tolist()


# Explains how well D' explains a given sample (s) as an anomaly
def explanation_score(D_prime, s, F, F_diff, F_sim):
    D_prime_len = len(D_prime)
    
    term_1 = 0 # Measures the similarity between the subset D′ and a row r in D′
    for i in range (0, D_prime_len):
        term_1 = term_1 + sim(D_prime, df.iloc[i], F)
    term1 = term_1 * (OMEGA_1 / len(D_prime))

    term_2 = OMEGA_2 * (sim(D_prime, s, F_sim) * (len(F_sim)/ len(F))) # Measures the similarity restricted to the features in 𝐹_𝑠𝑖𝑚
    term_3 = OMEGA_3 * (sim(D_prime, s, F_diff) * (len(F_diff)/ len(F))) # Measures the similarity restricted to the features in 𝐹_𝑑𝑖𝑓𝑓
    
    result = term_1 + term_2 - term_3
    return result

# Euclidean distances between D_prime and a sample (s) based on the specified features (features).
def sim(D_prime, s, features):
    # mean_vector = np.mean(D_prime, axis=0)
    # norm_value = np.linalg.norm(mean_vector - s)
    # s_E = 1 / (1 + norm_value)
    s_E = np.linalg.norm(D_prime[features] - s[features])
    return s_E

In [153]:
best_F_diff, best_F_sim = None, None
best_score = np.inf
cols_names = df.columns
rows_to_scores = {}

for i in range(1, int(len(df))*2):
    random_index = np.random.randint(0, len(df)) 
    subset_df = df.iloc[random_index]
    F_diff, F_sim = random_split(cols_names)
    
    score = explanation_score(subset_df, selected_anomaly, cols_names.tolist(), F_diff, F_sim)
    
    rows_to_scores[random_index] = score
    if score < best_score:
        best_score = score
        best_F_diff = F_diff
        best_F_sim = F_sim

        
# Sort the dictionary by its values
sorted_data = {k: v for k, v in sorted(rows_to_scores.items(), key=lambda item: item[1])}

row = 0
for key, value in sorted_data.items():
    if row <= num_counterexamples:
        print(f"{key:<10}{value:<20.10f}")
        row += 1
    else:
        break

7         325028.0521856965   
143       325202.5708799603   
94        325270.8088938369   
452       325281.5478314809   
1111      325352.2303734593   
100       325405.0460867200   


### Monte Carlo _(using Substrat ideas)_ **VS** Counterexamples
*As we can see, the results of the Monte Carlo method are dramatically different from those of the Counterexamples method.  
This discrepancy might be due to the hyperparameters (OMEGA_1-3).  
However, even if we adjust the hyperparameter values to match those used in the counterexample calculation, we would still need more iterations to achieve similar results.  
Therefore, for now (until we find better hyperparameters or a new, improved method), we will **prefer to use the Counterexamples method**.*

---
## Simple statistical methods
# To Do: add comments

In [249]:
mean = df.mean()
std_dev = df.std()
min_delta_z_scores = []
for index, row in df.iterrows():
    cur_delta = abs(row - selected_anomaly).mean()
    if predictions[index] == 1:
        if len(min_delta_z_scores) < 10:
            heapq.heappush(min_delta_z_scores, (-cur_delta, index))
        else:
            heapq.heappushpop(min_delta_z_scores, (-cur_delta, index))

min_delta_z_scores.sort()
print(min_delta_z_scores)


# Display the anomaly and its counterexamples
print("\nAnomaly:")
print(tabulate(df.iloc[selected_anomaly_index].to_frame().transpose(), headers='keys', tablefmt='grid', floatfmt=".8f"))
print("\n\n")

print("Rows with the maximum z scores:")
for _,idx in min_delta_z_scores:
    print(tabulate(df.iloc[idx].to_frame().transpose(), headers='keys', tablefmt='grid', floatfmt=".8f"))
    print("\n")

[(-408.125, 929), (-391.0, 67), (-365.5, 572), (-343.25, 147), (-330.875, 822), (-291.75, 815), (-261.125, 776), (-249.125, 1300), (-145.0, 732), (-0.0, 2)]

Anomaly:
+----+--------------+----------------+---------------+---------------+---------------+----------------+--------------+-----------------+
|    |   MSSubClass |        LotArea |   OverallQual |   OverallCond |     YearBuilt |   YearRemodAdd |   GarageCars |       SalePrice |
|  2 |  60.00000000 | 11250.00000000 |    7.00000000 |    5.00000000 | 2001.00000000 |  2002.00000000 |   2.00000000 | 223500.00000000 |
+----+--------------+----------------+---------------+---------------+---------------+----------------+--------------+-----------------+



Rows with the maximum z scores:
+-----+--------------+----------------+---------------+---------------+---------------+----------------+--------------+-----------------+
|     |   MSSubClass |        LotArea |   OverallQual |   OverallCond |     YearBuilt |   YearRemodAdd |   Garag

---
## Entropy Calculation
*Calculate the entropy value of the given anomaly example.  
Iterate through all the rows and calculate the entropy value for each one.  
Save the **rows with entropy values most similar** to that of the given anomaly example.*

In [206]:
# p_j(v) calculation
def calc_prob_of_val(col_name, val):
    prob = (len(df[df[col_name] == val]))/ rows_amount
    return prob

probs = []
rows_amount = len(df)

def calc_row_entropy(given_row):
    res = 0
    for col in df:
        cur_prob = calc_prob_of_val(col, given_row[col])
        res = res + (cur_prob * math.log(cur_prob,2))
    return res

anomaly_entropy_val = calc_row_entropy(selected_anomaly)
print(anomaly_entropy_val)
similar_entropy_rows = [(-1, float('inf'))] * 5
for index, row in df.iterrows():
    row_entropy_val = calc_row_entropy(row)
    max_delta_tuple = max(similar_entropy_rows, key=lambda x: abs(x[1] - anomaly_entropy_val)) # Find the row with the maximum delta from the given anomaly entropy
    max_delta_value = max_delta_tuple[1]
    if (row_entropy_val < max_delta_value):
        max_delta_index = similar_entropy_rows.index(max_delta_tuple)
        similar_entropy_rows[max_delta_index] = (index,row_entropy_val)

for tup in similar_entropy_rows:
    print(tup)




-2.1638663664912516
(281, -2.5804767786146634)
(10, -2.2254235551733075)
(2, -2.1638663664912516)
(8, -2.209034790477028)
(6, -2.234731100817478)


---
***TODO: COMBINE MONTE CARLO AND ENTROPY CALCULATION***

