In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
import seaborn as sns
from sklearn.neighbors import NearestNeighbors

In [1]:
from scipy.special import softmax
from sklearn.neighbors import NearestNeighbors
import numpy as np
import pandas as pd

class SchoolManifold():
    """k-nearest-neighbour structure over a table of school embeddings.

    On construction the k nearest neighbours of every embedding row are
    computed. `preprocess_for_target` then relates those neighbourhoods to a
    target column and stores, per target row, a softmax-weighted mean distance
    to the neighbours whose target value is strictly higher (`v_star`).

    Attributes set by ``__init__``:
        embeddings_df: the frame passed in (must have a ``"unitid"`` column to
            use `preprocess_for_target`).
        embeddings: every column of ``embeddings_df`` except the first.
        k: number of neighbours per row.
        weight_mode: the neighbour weighting scheme (only ``"softmax"``).
        neighbor_dists, neighbor_indices: ``(n_rows, k)`` arrays of neighbour
            distances and positional row indices into ``embeddings``.
    """

    # Weighting schemes that preprocess_for_target actually implements.
    _SUPPORTED_WEIGHT_MODES = ("softmax",)

    def __init__(self, embeddings_df: pd.DataFrame, k = 10, weight_mode = "softmax"):
        """Store the embeddings and compute each row's k nearest neighbours.

        Args:
            embeddings_df: one row per school. The first column is excluded
                from the embedding matrix — presumably it is the ``unitid``
                identifier; confirm against the data.
            k: number of neighbours to find for every row.
            weight_mode: neighbour weighting scheme; only ``"softmax"`` is
                implemented.

        Raises:
            ValueError: if ``weight_mode`` is not a supported scheme.
        """
        if weight_mode not in self._SUPPORTED_WEIGHT_MODES:
            raise ValueError(
                f"Unsupported weight_mode {weight_mode!r}; "
                f"expected one of {self._SUPPORTED_WEIGHT_MODES}"
            )
        self.embeddings_df = embeddings_df
        self.embeddings = embeddings_df.iloc[:, 1:]
        self.k = k
        self.weight_mode = weight_mode
        # The neighbour search does not depend on the weighting scheme, so it
        # is not forwarded (forwarding it previously raised TypeError).
        self._compute_neighbors(k)

    def _compute_neighbors(self, k: int):
        """Fit a k-NN index on the embeddings and cache distances and indices.

        ``kneighbors()`` is called without a query, so the neighbours of each
        fitted row are returned.
        """
        knn = NearestNeighbors(n_neighbors = k, algorithm="auto")
        knn.fit(self.embeddings.values)
        dists, indices = knn.kneighbors()
        self.neighbor_dists = dists
        self.neighbor_indices = indices

    def preprocess_for_target(self, target_df: pd.DataFrame, target_col_ind: int = None, target_col_lab = None):
        """Split ``target_df`` into X / y and compute ``v_star`` for each row.

        For every row of ``target_df`` the embedding neighbours with a strictly
        higher target value are weighted by ``softmax(-distance)``; ``v_star``
        is the weighted sum of their distances. Rows with no such neighbour
        (including neighbours absent from ``target_df``) get ``NaN``.

        Args:
            target_df: frame with a ``"unitid"`` column and a numeric target
                column. Every ``unitid`` must be present in ``embeddings_df``.
            target_col_ind: positional index of the target column. Takes
                precedence over ``target_col_lab`` when both are given.
            target_col_lab: label of the target column.

        Raises:
            ValueError: if neither target selector is given, if
                ``embeddings_df`` has duplicate ``unitid`` values, or if
                ``target_df`` holds a ``unitid`` missing from ``embeddings_df``.

        Sets ``target_df``, ``target_X``, ``target_y`` and ``v_star``.
        """
        if (target_col_ind is not None):
            target_col_lab = target_df.columns[target_col_ind]

        if (target_col_lab is None):
            raise ValueError("Either target_col_ind or target_col_lab must not be None")

        self.target_df = target_df
        self.target_X = target_df.drop(labels = ["unitid", target_col_lab], axis = 1)
        self.target_y = target_df[target_col_lab]

        # Map every target row to its positional row in the embeddings. The
        # neighbour arrays are positional, so labels of embeddings_df's index
        # must not be used here.
        unitid_index = pd.Index(self.embeddings_df["unitid"])
        if not unitid_index.is_unique:
            raise ValueError("embeddings_df contains duplicate unitid values")
        rows_in_embeddings = unitid_index.get_indexer(target_df["unitid"])
        if (rows_in_embeddings < 0).any():
            raise ValueError("target_df contains unitid values that are missing from embeddings_df")

        neighbor_rows = self.neighbor_indices[rows_in_embeddings]
        neighbor_dists = self.neighbor_dists[rows_in_embeddings]

        # Neighbour indices address embedding rows, so the target values are
        # scattered into an embedding-aligned array first. Schools without a
        # target value stay NaN and therefore never compare as "higher".
        target_vals = np.asarray(target_df[target_col_lab], dtype=float)
        vals_by_embedding_row = np.full(len(self.embeddings_df), np.nan)
        vals_by_embedding_row[rows_in_embeddings] = target_vals
        neighbor_vals = vals_by_embedding_row[neighbor_rows]

        is_higher = neighbor_vals > target_vals.reshape(-1, 1)
        has_higher = is_higher.any(axis = 1)

        # +inf distance -> zero softmax weight for neighbours that are not
        # higher. np.inf replaces np.PINF, which was removed in NumPy 2.0.
        masked_dists = np.where(is_higher, neighbor_dists, np.inf)
        weights = np.zeros(neighbor_dists.shape, dtype=float)
        # Softmax only rows with a finite entry; an all-inf row would produce
        # NaN through a (-inf) - (-inf) runtime warning.
        weights[has_higher] = softmax(-masked_dists[has_higher], axis = 1)

        v_star = (neighbor_dists * weights).sum(axis = 1)
        v_star[~has_higher] = np.nan
        self.v_star = v_star


KeyboardInterrupt: 

In [None]:
# Load the IC clusters / MCA table (first CSV column becomes the index) and
# leave out the "ic_cluster" column if it is present.
ic_mca = (
    pd.read_csv("../datasets/Clusters/IC_CLUSTERS_AND_MCA.csv", index_col=0)
    .drop(columns="ic_cluster", errors="ignore")
)

### Testing Area

In [23]:
# Build the nearest-neighbour structure over the IC/MCA table using the constructor defaults.
manifold = SchoolManifold(ic_mca)

In [76]:
# Peek at the neighbour row indices stored for embedding rows 2 and 3.
manifold.neighbor_indices[[2,3]]

array([[6304, 6035, 5222, 6275, 5733, 5732, 6727, 5762, 6573, 6117],
       [1133, 1782, 1345, 3417, 1978, 1757, 3500, 1455,  543,  763]])

In [86]:
# Toy check of the mask-then-softmax weighting: entries that are not strictly
# greater than their row's threshold are set to +inf, so after negation the
# softmax gives them exactly zero weight.
test = np.array([[1, 2, 3], [2, 3, 4], [3, 4, 5], [4, 5, 6]])
mask = test > np.array([2, 3, 4, 5]).reshape(-1, 1)

# np.inf rather than np.PINF: the PINF alias was removed in NumPy 2.0.
altered_test = np.where(mask, test, np.inf)
softmax(-altered_test, axis = 1)

array([[-inf, -inf,  -3.],
       [-inf, -inf,  -4.],
       [-inf, -inf,  -5.],
       [-inf, -inf,  -6.]])

In [87]:
# Toy frames for exercising a right-merge on the shared key column "E".
# df: 50 rows whose keys are the even numbers 0..98.
df = pd.DataFrame(np.random.randn(50, 1), columns=["A"])
df["E"] = 2*np.arange(0, 50)

# df2: 100 rows. It needs the key column too — merging on "E" raises KeyError
# when only one side has it.
# NOTE(review): keys 0..99 are an assumption, chosen so every key in df
# matches exactly one df2 row — confirm this was the intended setup.
df2 = pd.DataFrame(np.random.randn(100, 1), columns=["B"])
df2["E"] = np.arange(0, 100)

In [74]:
# Right-merge df onto df2 by the key "E", carrying df2's original row index
# along as a column and restoring it as the index of the result.
df3 = (
    df2.reset_index()
    .merge(df, how="right", on="E")
    .set_index("index")
)
df3