In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('../../data/knn_recommendations/similarity.csv', dtype={'code': str, 'id': str})

df = df.sample(frac=1).reset_index(drop=True) 

#add in some fake na
df.at[0, 'lat'] = None
df.at[0, 'long'] = None
df.at[0, 'code'] = None   

df

Unnamed: 0,id,lat,long,code
0,20112024,,,
1,21031341,53.383130,-2.596700,62020
2,10658658,51.504712,-0.616160,84210
3,20589266,54.971620,-1.424780,10831
4,19970548,51.350220,-0.259838,84130
...,...,...,...,...
7899,10463297,52.644070,1.235580,66190
7900,10549183,50.854930,0.577020,82190
7901,11191538,52.883580,0.000000,29202
7902,18297459,51.553910,-0.196010,47110


In [3]:
X = df.drop(columns=['id']).to_numpy()
y = df['id'].to_numpy()

print(X)
print(y)

[[nan nan None]
 [53.38313 -2.5967 '62020']
 [51.504712 -0.61616 '84210']
 ...
 [52.88358 0.0 '29202']
 [51.55391 -0.19601 '47110']
 [53.821195 -1.509443 '25120']]
['20112024' '21031341' '10658658' ... '11191538' '18297459' '16279472']


In [4]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsClassifier

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)




In [5]:
transformer = ColumnTransformer([
        ('coords', Pipeline([
            ('impute_coords', SimpleImputer(missing_values=np.nan, strategy='mean')),
            ('scale_coords', MinMaxScaler()),
          ]), [0, 1]),
        ('code', Pipeline([
            ('impute_code', SimpleImputer(strategy='constant', fill_value='None')),
            ('ohe_code', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
          ]), [2]),
    ], remainder='passthrough')

classifier = KNeighborsClassifier(n_neighbors=10)

model = Pipeline([
    ('transformer', transformer),
    ('classifier', classifier)
])

model.fit(X_train, y_train)
# model.transform(X_train, y_train)
print(model.predict([X_test[0, :]]))
model.score(X_test, y_test)

['10860338']


0.0

In [6]:
all_predictions = model.predict_proba(X)

def best_ten_predictions(preds, y):
    pairs = [(id, pred) for id, pred in zip(y, preds)]
    pairs.sort(key=lambda x: x[1], reverse=True)
    return [id for id, pred in pairs[:10]]

results = pd.DataFrame([ [k, best_ten_predictions(preds, y)] for k, preds in zip(y, all_predictions) ], columns=['id', 'predictions'])
results

Unnamed: 0,id,predictions
0,20112024,"[21030611, 16474181, 09609536, 17064470, 05327..."
1,21031341,"[16676257, 21111485, 00878887, 16026383, 17650..."
2,10658658,"[00904598, 09530125, 11543880, 06115623, 17823..."
3,20589266,"[17876081, 19214307, 15753351, 20941049, 09487..."
4,19970548,"[20105467, 21015499, 21084280, 09469797, 06042..."
...,...,...
7899,10463297,"[01077361, 03329171, 20681453, 16883616, 12950..."
7900,10549183,"[02756938, 15892323, 15722073, 16033105, 10787..."
7901,11191538,"[15293938, 02940897, 00299480, 04712308, 05950..."
7902,18297459,"[17351452, 07281486, 20955117, 11279607, 16952..."
