# Import Statements

In [1]:
import pandas as pd
import numpy as np

# Code

> avg_columns: $(df, str) \rightarrow df$

**Parameters**: A dataframe object and a string for the suffix of the column.

**Returns**: A single-column dataframe, named `avgValue<suffix>`, holding the row-wise mean of all columns of the input dataframe (same index as the input).

> concat_lookup: $(df, str) \rightarrow Maybe \ df$

**Parameters**: A dataframe object and a string for the identifier of the lookup table.

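**Returns**: A dataframe obtained by left-merging the df parameter (on its index) with the row-wise average of the lookup table `lookup_<identifier>.csv`; missing values in the merged result are filled with 0. Returns None if an exception is thrown while loading the lookup table (e.g. during read_csv()).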

In [2]:
def avg_columns(df, suffix):
    """Collapse every row of `df` into the mean of its columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are averaged row by row.
    suffix : str
        Text appended to "avgValue" to build the output column name.

    Returns
    -------
    pandas.DataFrame
        One column, "avgValue<suffix>", indexed like `df`.
    """
    column_name = "avgValue" + suffix
    row_means = df.mean(axis=1)
    return row_means.to_frame(name=column_name)

def concat_lookup(df, df_id):
    """Left-join the row-wise average of a lookup table onto `df`.

    Reads 'lookup_<df_id>.csv' using its first column as the index, reduces it
    to a single 'avgValue<df_id>' column with `avg_columns`, and merges that
    column onto `df` by index.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend; its index is the join key.
    df_id : str
        Identifier of the lookup table; used both in the file name and as the
        suffix of the new column.

    Returns
    -------
    pandas.DataFrame or None
        The merged frame, or None when loading/averaging the lookup table
        raised (the error is printed, not re-raised).
    """
    lookup_path = f'lookup_{df_id}.csv'
    try:
        new_df = (
            pd.read_csv(lookup_path, index_col=0)
            .pipe(avg_columns, suffix=df_id)
        )
    except Exception as e:
        # Deliberate best-effort: report which file failed and signal the
        # failure to the caller with None instead of raising.
        print(f"Error: unexpected error when reading file '{lookup_path}': {e}")
        return None
    merged = df.merge(new_df, how='left', left_index=True, right_index=True)
    # Rows of `df` without a match in the lookup get NaN from the left join;
    # note that fillna(0) replaces every NaN in the merged frame, not only
    # those in the newly added column.
    return merged.fillna(0)


In [3]:
# Start from 'lookup_war.csv', then attach one avgValue<id> column per
# lookup file, in order.
merged = pd.read_csv('lookup_war.csv', index_col=0)
for lookup_id in ('1', '2', '3'):
    merged = merged.pipe(concat_lookup, df_id=lookup_id)
lookup_df = merged
lookup_df

Unnamed: 0,degree,avgValue1,avgValue2,avgValue3
0x0000000000000000000000000000000000000000,579.0,2251.333333,1893.0,1288.333333
0x0000000000000000000000000000000000000001,16.0,5.000000,5.0,76.000000
0x0000000000000000000000000000000000000064,1.0,0.000000,0.0,0.333333
0x0000000000000000000000000000000000000800,2.0,0.000000,0.0,0.000000
0x0000000000000000000000000000000000001010,7.0,1.000000,5.0,3.666667
...,...,...,...,...
0xffffffff2ba8f66d4e51811c5190992176930278,133.0,2455.666667,871.0,492.333333
0xfffffffff15abf397da76f1dcc1a1604f45126db,67.0,1921.333333,407.5,191.666667
0xffffffffff2419497bf75e415bc9a7d446e05c0f,2.0,17.333333,55.0,13.000000
0xffffffffff402b1b62421a978cf93a56453d1496,1.0,0.000000,0.0,0.000000


> calc_df_distances: $(df, str) \rightarrow Maybe \ df$

**Parameters**: A dataframe object and a string containing the name of the target column.

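**Returns**: A new dataframe containing the target column, one `<column>_dist` column for each of the other columns holding the relative distance $|t - c| \, / \, \max(|t|, |c|)$ between the target value $t$ and that column's value $c$, and a `total_dist` column with the row-wise sum of those distances; or None if the target column does not exist.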

In [4]:
def calc_df_distances(df_, target_col):
    """Compute the relative distance of every column to a target column.

    For each column c other than the target t, the distance is
    |t - c| / max(|t|, |c|). When both values are 0 the distance is defined
    as 0 (identical values) rather than the NaN that 0/0 would give.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Input frame with numeric columns; it is not modified.
    target_col : str
        Name of the column every other column is compared against.

    Returns
    -------
    pandas.DataFrame or None
        A new frame with the target column first, then one '<col>_dist'
        column per other column (in their original order), then 'total_dist',
        the row-wise sum of the distance columns. None if `target_col` is not
        a column of `df_`.
    """
    if target_col not in df_.columns:
        return None

    other_cols = [col for col in df_.columns if col != target_col]
    # Loop-invariant: extract the target values once.
    target_val = df_[target_col].to_numpy(dtype=float)

    result = df_[[target_col]].copy()
    dist_cols = []
    for col in other_cols:
        col_val = df_[col].to_numpy(dtype=float)
        diff = np.abs(target_val - col_val)
        max_abs = np.maximum(np.abs(target_val), np.abs(col_val))
        # Divide only where the denominator is non-zero; positions where both
        # values are 0 keep the pre-filled 0.0. NaN inputs still yield NaN.
        dist = np.divide(diff, max_abs, out=np.zeros_like(diff), where=max_abs != 0)
        dist_name = col + '_dist'
        result[dist_name] = dist
        dist_cols.append(dist_name)

    # Sum only the distance columns (NaN distances are skipped by sum()).
    result['total_dist'] = result[dist_cols].sum(axis=1)
    return result

In [5]:
# Distance of every other column of lookup_df from 'degree'.
df = calc_df_distances(lookup_df, target_col='degree')
df

Unnamed: 0,degree,avgValue1_dist,avgValue2_dist,avgValue3_dist,total_dist
0x0000000000000000000000000000000000000000,579.0,0.742819,0.694136,0.550582,1.987538
0x0000000000000000000000000000000000000001,16.0,0.687500,0.687500,0.789474,2.164474
0x0000000000000000000000000000000000000064,1.0,1.000000,1.000000,0.666667,2.666667
0x0000000000000000000000000000000000000800,2.0,1.000000,1.000000,1.000000,3.000000
0x0000000000000000000000000000000000001010,7.0,0.857143,0.285714,0.476190,1.619048
...,...,...,...,...,...
0xffffffff2ba8f66d4e51811c5190992176930278,133.0,0.945840,0.847302,0.729858,2.522999
0xfffffffff15abf397da76f1dcc1a1604f45126db,67.0,0.965128,0.835583,0.650435,2.451146
0xffffffffff2419497bf75e415bc9a7d446e05c0f,2.0,0.884615,0.963636,0.846154,2.694406
0xffffffffff402b1b62421a978cf93a56453d1496,1.0,1.000000,1.000000,1.000000,3.000000


Now we have $df$ ready to evaluate as we want.

In [8]:
dist_lower_bound = .7
dist_cols = ['avgValue1_dist', 'avgValue2_dist', 'avgValue3_dist']

# Keep rows whose total distance lies in [2.5, 3) ...
in_band = df['total_dist'].ge(2.5) & df['total_dist'].lt(3)
# ... and whose individual distances all exceed the lower bound.
above_bound = df[dist_cols].gt(dist_lower_bound).all(axis=1)

df = df.loc[in_band & above_bound]
df.sort_values(by='total_dist', ascending=False).head(10)

Unnamed: 0,degree,avgValue1_dist,avgValue2_dist,avgValue3_dist,total_dist
0x4c9af439b1a6761b8e549d8d226a468a6b2803a8,12396.0,1.0,1.0,0.999973,2.999973
0xcdf02971871b7736874e20b8487c019d28090019,5994.0,1.0,1.0,0.999944,2.999944
0x781229c7a798c33ec788520a6bbe12a79ed657fc,3054.0,1.0,1.0,0.999891,2.999891
0x6b0b3a982b4634ac68dd83a4dbf02311ce324181,2540.0,1.0,1.0,0.999869,2.999869
0x41d3ab85aafed2ef9e644cb7d3bbca2fc4d8cac8,1.0,1.0,1.0,0.999824,2.999824
0xa18607ca4a3804cc3cd5730eafefcc47a7641643,3.0,1.0,1.0,0.999812,2.999812
0xb9ee1e551f538a464e8f8c41e9904498505b49b0,2.0,0.99989,0.99995,0.999966,2.999807
0xb2f43262fc23d253538ca5f7b4890f89f0ee95d9,1.0,1.0,1.0,0.999737,2.999737
0x4d14b24edb751221b3ff08bbb8bd91d4b1c8bc77,1265.0,1.0,1.0,0.999736,2.999736
0xcfefa72a86f6d45786d23f5319beeacc75cdb5fe,4.0,1.0,1.0,0.999725,2.999725


In [9]:
# Values in lookup_df (before distances were computed) for one address
# from the top-10 table above.
address = '0x41d3ab85aafed2ef9e644cb7d3bbca2fc4d8cac8'
lookup_df.loc[address]

degree          1.000000
avgValue1       0.000000
avgValue2       0.000000
avgValue3    5684.666667
Name: 0x41d3ab85aafed2ef9e644cb7d3bbca2fc4d8cac8, dtype: float64