In [15]:
import pandas as pd
import numpy as np

In [16]:
df = pd.read_csv('../../data/knn_recommendations/similarity.csv', dtype={'code': str, 'id': str})
df = df.sample(frac=1).reset_index(drop=True) 
df

Unnamed: 0,id,lat,long,code
0,21081483,53.573579,-2.573582,43110
1,20026781,53.442759,-1.950924,55100
2,18381810,51.374550,0.000000,47110
3,20072079,51.521050,-0.108650,13960
4,04206228,52.293659,-1.655601,23110
...,...,...,...,...
7899,10962288,51.591620,0.497530,80200
7900,16328585,52.637570,1.329880,87200
7901,09607509,51.197197,-0.615825,05101
7902,20314073,53.362202,-0.688112,01430


In [17]:
grouped = df.groupby("code")

In [18]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsClassifier

transformer = ColumnTransformer([
        ('coords', Pipeline([
            ('impute_coords', SimpleImputer(missing_values=np.nan, strategy='mean')),
            ('scale_coords', MinMaxScaler()),
          ]), [0, 1]),
    ], remainder='passthrough')

classifier = KNeighborsClassifier(n_neighbors=10)

In [19]:
def best_ten_predictions(preds, y):
    pairs = [(i, pred) for i, pred in zip(y, preds)]
    pairs.sort(key=lambda x: x[1], reverse=True)
    best = [i for i, pred in pairs[:10]]
    # print(best)
    return best

In [20]:
all_results = pd.DataFrame([])

# create a KNN model for each code
for key, grp in grouped:
    # ignore  codes with < 11 items
    if grp.shape[0] < 11:
        continue
    print(key, end='\r')
    X = grp.drop(columns=['id', 'code']).to_numpy()
    y = grp['id'].to_numpy()

    model = Pipeline([
        ('transformer', transformer),
        ('classifier', classifier)
    ])
    model.fit(X, y)
    all_predictions = model.predict_proba(X)
    results = pd.DataFrame([ [k, best_ten_predictions(preds, y)] for k, preds in zip(y, all_predictions) ], columns=['id', 'predictions'])
    # save the best predictions
    all_results = pd.concat([all_results, results])
    
all_results

93191

Unnamed: 0,id,predictions
0,19593988,"[17818776, 20671233, 04972684, 14480733, 16079..."
1,18761165,"[17710934, 14801065, 17056896, 20238217, 18486..."
2,07474951,"[07588430, 09426720, 17879286, 20893065, 19956..."
3,17932043,"[20698027, 09426720, 17879286, 19499062, 19920..."
4,07588430,"[18761165, 20698027, 09426720, 17879286, 19514..."
...,...,...
95,09520508,"[09573298, 08300880, 19215155, 07881890, 18070..."
96,08072249,"[09573298, 08300880, 19215155, 15815697, 07881..."
97,09279939,"[07051793, 19278640, 18937870, 13207514, 08987..."
98,05993271,"[15705227, 07803036, 04766054, 18222104, 17869..."
