In [124]:
import pandas as pd
import numpy as np

In [125]:
df = pd.read_csv('../../data/knn_recommendations/company-similarity.csv', dtype={'code': str, 'id': str})

df = df.sample(frac=1).reset_index(drop=True) 

#add in some fake na
df.at[0, 'lat'] = None
df.at[0, 'long'] = None
df.at[0, 'code'] = None   

df

Unnamed: 0,id,lat,long,code
0,15815697,,,
1,04998966,51.288509,-0.850192,84210
2,15748966,51.351990,0.000000,28923
3,01861357,51.414200,-0.752670,08110
4,13324102,51.538687,-0.729496,58120
...,...,...,...,...
7899,10426415,52.500720,-2.029190,78100
7900,00472132,50.815540,-1.140540,46770
7901,03481810,51.988111,-2.286752,01420
7902,14837638,51.511560,0.000000,66190


In [126]:
X = df.drop(columns=['id']).to_numpy()
y = df['id'].to_numpy()

print(X)
print(y)

[[nan nan None]
 [51.288509 -0.850192 '84210']
 [51.35199 0.0 '28923']
 ...
 [51.988111 -2.286752 '01420']
 [51.51156 0.0 '66190']
 [51.49355 -0.24233 '26800']]
['15815697' '04998966' '15748966' ... '03481810' '14837638' '17300130']


In [127]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsClassifier

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)




In [131]:
transformer = ColumnTransformer([
        ('coords', Pipeline([
            ('impute_coords', SimpleImputer(missing_values=np.nan, strategy='mean')),
            ('scale_coords', MinMaxScaler()),
          ]), [0, 1]),
        ('code', Pipeline([
            ('impute_code', SimpleImputer(strategy='constant', fill_value='None')),
            ('ohe_code', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
          ]), [2]),
    ], remainder='passthrough')

classifier = KNeighborsClassifier(n_neighbors=10)

model = Pipeline([
    ('transformer', transformer),
    ('classifier', classifier)
])

model.fit(X_train, y_train)
# model.transform(X_train, y_train)
print(model.predict([X_test[0, :]]))
model.score(X_test, y_test)

['01077361']


0.0

In [150]:
all_predictions = model.predict_proba(X)

def best_ten_predictions(preds, y):
    pairs = [(id, pred) for id, pred in zip(y, preds)]
    pairs.sort(key=lambda x: x[1], reverse=True)
    return [id for id, pred in pairs[:10]]

results = pd.DataFrame([ [k, best_ten_predictions(preds, y)] for k, preds in zip(y, all_predictions) ], columns=['id', 'predictions'])
results

Unnamed: 0,id,predictions
0,15815697,"[19084892, 17037966, 04836896, 21019382, 00076..."
1,04998966,"[04832238, 08367011, 18440733, 02904020, 16196..."
2,15748966,"[10890235, 19070806, 16191914, 16169769, 04890..."
3,01861357,"[05058283, 16927760, 16211160, 16580385, 04853..."
4,13324102,"[00023067, 15699287, 21067458, 18400357, 00303..."
...,...,...
7899,10426415,"[19289496, 16480064, 18209800, 17800646, 00022..."
7900,00472132,"[16242953, 06077928, 04722954, 20035835, 09498..."
7901,03481810,"[04727262, 01868312, 09530125, 08956473, 10638..."
7902,14837638,"[08645105, 16793115, 20318801, 07250374, 03308..."
