In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('data/recommendations/similarity.csv', dtype={'code': str, 'id': str})
df = df.sample(frac=1).reset_index(drop=True) 
df

Unnamed: 0,id,lat,long,code
0,18704354,51.746120,-0.972340,74909
1,16809405,53.363044,-1.495492,74901
2,09476177,51.626830,0.000000,17240
3,17517305,53.426232,-1.421989,60200
4,16290661,51.520330,0.054360,88910
...,...,...,...,...
7899,05893625,52.026390,-0.773350,27400
7900,16430345,53.588950,-2.412460,23110
7901,16299176,51.763285,-3.154901,20420
7902,17569939,52.470630,0.959950,29202


In [3]:
grouped = df.groupby("code")

In [4]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsClassifier

transformer = ColumnTransformer([
        ('coords', Pipeline([
            ('impute_coords', SimpleImputer(missing_values=np.nan, strategy='mean')),
            ('scale_coords', MinMaxScaler()),
          ]), [0, 1]),
    ], remainder='passthrough')

classifier = KNeighborsClassifier(n_neighbors=10)

In [5]:
def best_ten_predictions(preds, y):
    pairs = [(i, pred) for i, pred in zip(y, preds)]
    pairs.sort(key=lambda x: x[1], reverse=True)
    best = [i for i, pred in pairs[:10]]
    # print(best)
    return best

In [6]:
all_results = pd.DataFrame([])

# create a KNN model for each code
for key, grp in grouped:
    # ignore  codes with < 11 items
    if grp.shape[0] < 11:
        continue
    print(key, end='\r')
    X = grp.drop(columns=['id', 'code']).to_numpy()
    y = grp['id'].to_numpy()

    model = Pipeline([
        ('transformer', transformer),
        ('classifier', classifier)
    ])
    model.fit(X, y)
    all_predictions = model.predict_proba(X)
    results = pd.DataFrame([ [k, best_ten_predictions(preds, y)] for k, preds in zip(y, all_predictions) ], columns=['id', 'predictions'])
    # save the best predictions
    all_results = pd.concat([all_results, results])
    
all_results

93191

Unnamed: 0,id,predictions
0,08294215,"[18503753, 17710934, 16079063, 17100770, 19003..."
1,20698027,"[19514947, 14474550, 19245308, 18486596, 18560..."
2,06628865,"[19514947, 14474550, 19593988, 19245308, 18486..."
3,19514947,"[06628865, 16952583, 14459977, 16178948, 19635..."
4,19504159,"[15286529, 18482610, 05003987, 14474550, 19245..."
...,...,...
95,17144320,"[09108011, 19643061, 08083087, 15633092, 09573..."
96,08666499,"[15929239, 09553311, 16210432, 08114276, 19278..."
97,16265185,"[15929239, 09553311, 16210432, 19278640, 18222..."
98,07432936,"[08123745, 15915473, 15979065, 19722669, 08697..."
