In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('data/recommendations/similarity.csv', dtype={'code': str, 'id': str})

df = df.sample(frac=1).reset_index(drop=True) 

#add in some fake na
df.at[0, 'lat'] = None
df.at[0, 'long'] = None
df.at[0, 'code'] = None   

df

Unnamed: 0,id,lat,long,code
0,07783345,,,
1,10056227,51.344010,0.000000,45200
2,12963867,54.971950,-1.657420,85100
3,00069555,51.266680,-0.254100,17220
4,16622449,52.890808,-2.152348,47782
...,...,...,...,...
7899,08735357,53.582000,-2.442530,25120
7900,11043697,54.238550,-0.758670,27120
7901,10483740,53.161661,-2.212648,75000
7902,18041874,53.209215,-0.117832,61900


In [3]:
X = df.drop(columns=['id']).to_numpy()
y = df['id'].to_numpy()

print(X)
print(y)

[[nan nan None]
 [51.34401 0.0 '45200']
 [54.97195 -1.65742 '85100']
 ...
 [53.161661 -2.212648 '75000']
 [53.209215 -0.117832 '61900']
 [50.73146 -1.91194 '84130']]
['07783345' '10056227' '12963867' ... '10483740' '18041874' '15842431']


In [4]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsClassifier

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)




In [5]:
transformer = ColumnTransformer([
        ('coords', Pipeline([
            ('impute_coords', SimpleImputer(missing_values=np.nan, strategy='mean')),
            ('scale_coords', MinMaxScaler()),
          ]), [0, 1]),
        ('code', Pipeline([
            ('impute_code', SimpleImputer(strategy='constant', fill_value='None')),
            ('ohe_code', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
          ]), [2]),
    ], remainder='passthrough')

classifier = KNeighborsClassifier(n_neighbors=10)

model = Pipeline([
    ('transformer', transformer),
    ('classifier', classifier)
])

model.fit(X_train, y_train)
# model.transform(X_train, y_train)
print(model.predict([X_test[0, :]]))
model.score(X_test, y_test)

['00537312']


0.0

In [6]:
all_predictions = model.predict_proba(X)

def best_ten_predictions(preds, y):
    pairs = [(id, pred) for id, pred in zip(y, preds)]
    pairs.sort(key=lambda x: x[1], reverse=True)
    return [id for id, pred in pairs[:10]]

results = pd.DataFrame([ [k, best_ten_predictions(preds, y)] for k, preds in zip(y, all_predictions) ], columns=['id', 'predictions'])
results

Unnamed: 0,id,predictions
0,07783345,"[09560140, 19306952, 08072249, 15293938, 17787..."
1,10056227,"[16398607, 19797700, 16079224, 17628954, 04745..."
2,12963867,"[17547698, 04732031, 13332763, 10698852, 02797..."
3,00069555,"[04162979, 16398270, 03903714, 08882578, 01808..."
4,16622449,"[04002913, 15954034, 17037966, 16598623, 06626..."
...,...,...
7899,08735357,"[04194434, 15950847, 16326900, 08097743, 17549..."
7900,11043697,"[16622449, 09602657, 17193976, 19810881, 16437..."
7901,10483740,"[05489721, 18365856, 20066664, 00228613, 08950..."
7902,18041874,"[03431952, 00034567, 07349344, 07619965, 10621..."
