In [None]:
# Unpack the competition archive stored on the mounted Drive (-q suppresses the per-file listing).
!unzip -q /content/drive/MyDrive/Kaggle/Foursquare/data/foursquare-location-matching.zip

In [None]:
# Install the third-party packages imported later in this notebook
# (Levenshtein for string distances, transformers for the text embeddings).
!pip install -q Levenshtein
!pip install -q transformers

In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import BallTree
from sklearn.neighbors import KNeighborsRegressor
from tqdm.notebook import tqdm
from sklearn.model_selection import GroupKFold, KFold, StratifiedKFold
import pickle
import lightgbm as lgbm
import Levenshtein
import difflib
import sklearn
import joblib
from sklearn.neighbors import NearestNeighbors
import warnings 
warnings.filterwarnings('ignore')
import string
from math import radians
import gc

ModuleNotFoundError: ignored

In [None]:
# Load the two competition tables from the current working directory.
train, test = pd.read_csv('./train.csv'), pd.read_csv('./test.csv')

In [None]:
# Normalise every attribute except the id to a lower-cased string.
# astype(str) also turns missing values into the literal string 'nan',
# which the string-feature code further down checks for explicitly.
columns = ['id', 'name', 'address', 'city', 'state','zip', 'country', 'url', 'phone', 'categories']
for c in columns:
    if c == 'id':
        continue
    train[c] = train[c].astype(str).str.lower()

In [None]:
# Split the rows into 5 folds, grouped by point_of_interest so that all
# entries of one POI end up in the same fold; the fold number is stored
# in the "set" column.
# The split yields positional indices while .loc is label based, so this
# relies on `train` still having its default 0..n-1 index.
kf = GroupKFold(n_splits=5)
for i, (trn_idx, val_idx) in enumerate(
    kf.split(train, y=train['point_of_interest'], groups=train['point_of_interest'])
):
    train.loc[val_idx, "set"] = i
train["set"].value_counts()

1.0    227763
0.0    227763
2.0    227762
3.0    227762
4.0    227762
Name: set, dtype: int64

In [None]:
def add_neighbor_features(df,Neighbors=10,cols=['latitude','longitude']):
    """Build candidate (id, near_id) pairs from each row's nearest spatial neighbours.

    Two neighbour searches are run over the coordinate columns ``cols``:

    * per ``country`` group, with the haversine metric, producing
      ``kdist_country`` (distance) and ``kneighbors_country`` (neighbour rank);
    * over the whole frame, with the default ``NearestNeighbors`` metric,
      producing ``kdist`` and ``kneighbors``.

    Rank 0 of every query is skipped (presumably the point itself; rows that
    share coordinates may still pair with themselves, so callers should
    filter ``id == near_id`` if that matters).

    Parameters
    ----------
    df : DataFrame with at least ``id``, ``country`` and the ``cols`` columns.
    Neighbors : maximum number of neighbours queried per row (rank 0 included).
    cols : the two coordinate columns, in (latitude, longitude) order.

    Returns
    -------
    DataFrame with columns ``id, near_id, kdist, kneighbors, kdist_country,
    kneighbors_country``: the outer join of both searches on (id, near_id),
    so a pair found by only one search has NaN in the other search's columns.
    """
    coord_cols = list(cols)
    pair_cols = ['id', 'near_id']
    out_cols = pair_cols + ['kdist', 'kneighbors', 'kdist_country', 'kneighbors_country']

    # Nothing to search: return an empty frame with the usual schema.
    if len(df) == 0:
        return pd.DataFrame(columns=out_cols)

    print('Start knn grouped by country')
    train_df_country = []
    # NOTE(review): scikit-learn's haversine metric expects [lat, lon] in radians.
    # No conversion is done here, so callers must pass radians - confirm.
    for country, country_df in tqdm(df.groupby('country')):
        country_df = country_df.reset_index(drop = True)

        # A country with fewer rows than Neighbors can only supply len(country_df) neighbours.
        neighbors = min(len(country_df), Neighbors)
        knn = KNeighborsRegressor(n_neighbors = neighbors,
                                    metric = 'haversine',
                                    n_jobs = -1)
        knn.fit(country_df[coord_cols], country_df.index)
        dists, nears = knn.kneighbors(country_df[coord_cols],
                                        return_distance = True)

        country_ids = country_df['id'].values
        for k in range(1,neighbors):
            # Build a fresh frame instead of assigning onto a slice of country_df.
            train_df_country.append(pd.DataFrame({
                'id': country_ids,
                'near_id': country_ids[nears[:, k]],
                'kdist_country': dists[:, k],
                'kneighbors_country': k,
            }))
    if train_df_country:
        train_df_country = pd.concat(train_df_country)
    else:
        # Every country had a single row: no per-country pairs exist.
        train_df_country = pd.DataFrame(columns=pair_cols + ['kdist_country', 'kneighbors_country'])

    print('Start knn')
    train_df = []
    # Same clamp as the per-country search so small inputs do not raise.
    n_global = min(len(df), Neighbors)
    knn = NearestNeighbors(n_neighbors = n_global)
    knn.fit(df[coord_cols])
    dists, nears = knn.kneighbors(df[coord_cols])

    all_ids = df['id'].values
    for k in range(1,n_global):
        train_df.append(pd.DataFrame({
            'id': all_ids,
            'near_id': all_ids[nears[:, k]],
            'kdist': dists[:, k],
            'kneighbors': k,
        }))

    if train_df:
        train_df = pd.concat(train_df)
    else:
        train_df = pd.DataFrame(columns=pair_cols + ['kdist', 'kneighbors'])

    train_df = train_df.merge(train_df_country,
    on = pair_cols,
    how = 'outer')
    del train_df_country

    return train_df

In [None]:
# Generate candidate pairs separately inside each fold (the "set" column),
# so neighbours are only ever drawn from rows of the same fold.
train_df = pd.concat([
    add_neighbor_features(train[train["set"] == fold])
    for fold in range(5)
])
# Keep pairs that have a partner and are not self-pairs. A single combined mask
# is used because the concatenated frame has repeated index labels, which a
# second, chained boolean filter could not be aligned against.
keep_pair = train_df['near_id'].notnull() & (train_df['id'] != train_df['near_id'])
train_df = train_df[keep_pair].reset_index(drop=True)

Start knn grouped by country


100%|██████████| 210/210 [01:07<00:00,  3.13it/s]


Start knn
Start knn grouped by country


100%|██████████| 211/211 [01:12<00:00,  2.90it/s]


Start knn


In [None]:
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
# Columns that receive string-similarity features.
feat_columns = ['name', 'address', 'city',
            'state', 'zip', 'url',
           'phone', 'categories', 'country']
# Columns that additionally receive vector-based (TF-IDF / embedding) similarities.
vec_columns = ['name', 'categories','address','url','phone','country','state']

## Train data generated by knn
# Map each id to its index label in `train` as it is at this point.
id2index_d = {entry_id: row_label for entry_id, row_label in zip(train['id'].values, train.index)}

# One term-frequency matrix per vector column (idf weighting switched off):
# word tokens for 'categories', character 3-grams within word boundaries otherwise.
tfidf_d = {}
for col in vec_columns:
    vectorizer_options = {} if col == 'categories' else {'ngram_range': (3, 3), 'analyzer': "char_wb"}
    tfidf = TfidfVectorizer(use_idf=False, **vectorizer_options)
    tfidf_d[col] = tfidf.fit_transform(train[col].fillna(f'no{col}'))
  

# NOTE(review): transformers is already installed by the `!pip install -q transformers`
# line near the top of the notebook; this repeat looks redundant.
!pip install -q transformers
from transformers import DistilBertModel, DistilBertTokenizer , AutoTokenizer, AutoModel
import torch.nn.functional as F
import torch.nn as nn
import torch
from torch.utils.data import DataLoader, Dataset
import pickle

def get_vec_col(col,bert_model='distilbert-base-uncased'):
    """Embed every distinct value of ``train[col]`` and save the value -> vector lookup.

    Each distinct string is tokenised (truncated/padded to 150 tokens for
    ``'text_features'``, 32 otherwise), run through the pretrained model
    ``bert_model``, and reduced to one vector: the masked hidden states of all
    positions after the first token are averaged and L2-normalised.

    The resulting ``{value: vector}`` dict is written with ``np.save`` to
    ``/content/drive/MyDrive/Kaggle/Foursquare/data/newdata/{col}_{bert_model}_catvec.npy``.

    Parameters
    ----------
    col : column of the module-level ``train`` frame to embed.
    bert_model : model name passed to ``AutoModel`` / ``AutoTokenizer``.

    Returns
    -------
    None. The only result is the saved file.
    """
    import sys

    if col == 'text_features':
        MAX_LEN = 150
    else:
        MAX_LEN = 32
    BS = 256
    NW = 2

    # Use the GPU when there is one, otherwise fall back to the CPU instead of crashing.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    class Cat2VecModel(nn.Module):
        def __init__(self):
            super(Cat2VecModel, self).__init__()
            self.distill_bert = AutoModel.from_pretrained(bert_model)

        def forward(self, ids, mask):
            # [0] is the per-token hidden state tensor.
            x = self.distill_bert(ids, mask)[0]
            # Drop position 0, zero out padded positions, average, then L2-normalise.
            x = F.normalize((x[:, 1:, :]*mask[:, 1:, None]).mean(axis=1))
            return x

    cat2vec_model = Cat2VecModel().to(device)
    # Inference only: make sure dropout and similar layers are disabled.
    cat2vec_model.eval()

    class InferenceDataset(Dataset):
        def __init__(self, df, max_len):
            super().__init__()
            self.df = df.reset_index(drop=True)
            self.max_len = max_len
            self.tokenizer = AutoTokenizer.from_pretrained(bert_model,do_lower_case=True)

        def __getitem__(self, index):
            row = self.df.iloc[index]

            inputs = self.tokenizer.encode_plus(
                row[col],
                None,
                add_special_tokens=True,
                max_length=self.max_len,
                padding="max_length",
                return_token_type_ids=True,
                truncation=True
            )
            ids = torch.LongTensor(inputs['input_ids'])
            mask = torch.LongTensor(inputs['attention_mask'])

            return ids, mask

        def __len__(self):
            return self.df.shape[0]

    # Only distinct values are embedded; missing values are embedded as the text "nan".
    cat_df = train[[col]].drop_duplicates()
    cat_df[col] = cat_df[col].fillna("nan")

    cat_ds = InferenceDataset(cat_df, max_len=MAX_LEN)

    def inference(ds):
        loader = DataLoader(ds, batch_size=BS, shuffle=False, num_workers=NW,
                            pin_memory=False, drop_last=False)
        tbar = tqdm(loader, file=sys.stdout)

        vs = []
        with torch.no_grad():
            for ids, masks in tbar:
                v = cat2vec_model(ids.to(device), masks.to(device)).detach().cpu().numpy()
                vs.append(v)
        return np.concatenate(vs)

    V = inference(cat_ds)

    # shuffle=False keeps V in the row order of cat_df, so values and vectors pair up by position.
    tmp_dict = dict(zip(cat_df[col].values, V))
    np.save(f'/content/drive/MyDrive/Kaggle/Foursquare/data/newdata/{col}_{bert_model}_catvec.npy',tmp_dict)

# Concatenated name / address / categories text, joined by the literal '[sep]'.
train['text_features'] = train['name'].fillna('nan') + '[sep]' + train['address'].fillna('nan') + '[sep]' + train['categories'].fillna('nan')

# Save one embedding lookup per (column, model) pair.
bert_models = ['distilbert-base-uncased']
for col in vec_columns:
  for bert_model in bert_models:
    # Pass the model explicitly; otherwise every iteration would silently use the default model.
    get_vec_col(col, bert_model)

Unnamed: 0,id,near_id,kdist,kneighbors,kdist_country,kneighbors_country
0,E_000001272c6c5d,E_3ef360d8d73dcf,0.000024,1.0,0.000022,2.0
1,E_000002eae2a589,E_c12fb799a8a0e4,0.000002,1.0,0.000002,1.0
2,E_000007f24ebc95,E_b664578fe0e8c2,0.000005,1.0,0.000005,1.0
3,E_000008a8ba4f48,E_c03acb4032c33d,0.000004,1.0,0.000004,2.0
4,E_00001d92066153,E_7e0d8e9138dd56,0.000002,1.0,0.000002,1.0
...,...,...,...,...,...,...
10890518,E_4c57a92c895eef,E_1827a3004c29ef,,,1.126334,6.0
10890519,E_6d33113f66fd27,E_1827a3004c29ef,,,1.181093,6.0
10890520,E_8c77abc0296481,E_1827a3004c29ef,,,1.180593,6.0
10890521,E_b639518f5a5a60,E_1827a3004c29ef,,,1.162739,6.0


In [None]:
# Enable the %%cython cell magic used to compile the LCS helper.
%load_ext Cython

In [None]:
%%cython
def LCS(str S, str T):
    """Return the length of the longest common subsequence of S and T.

    Fills a (len(S)+1) x (len(T)+1) dynamic-programming table in which
    cell [i][j] is the answer for the prefixes S[:i] and T[:j].
    """
    cdef int i, j
    cdef int n = len(S)
    cdef int m = len(T)
    cdef list table = [[0] * (m + 1) for _ in range(n + 1)]
    for i in range(1, n + 1):
        for j in range(1, m + 1):
            if S[i - 1] == T[j - 1]:
                # Matching characters extend the best subsequence of the shorter prefixes.
                table[i][j] = table[i - 1][j - 1] + 1
            else:
                # Otherwise carry over the better of dropping one character from either side.
                table[i][j] = max(table[i - 1][j], table[i][j - 1])
    return table[n][m]

In [None]:
def categorical_similarity(A, B):
    """Overlap score between two ", "-separated category strings.

    Returns -1 when either argument is falsy (e.g. None or ""). Otherwise both
    arguments are converted to str, split on ", " into sets, and the number of
    shared entries is divided by the size of each set; the larger ratio is returned.
    """
    if not (A and B):
        return -1

    left = set(str(A).split(", "))
    right = set(str(B).split(", "))

    # Number of categories present on both sides
    shared = len(left & right)

    return max(shared / len(left), shared / len(right))

In [None]:
import Levenshtein
import difflib
import multiprocessing

def add_features(df):
    """Attach pairwise similarity features to candidate (id, near_id) pairs.

    For every column ``col`` in the module-level ``feat_columns`` this adds:

    * ``{col}_sim`` - row-wise dot product of the two entries' TF-IDF vectors
      from ``tfidf_d`` (only for columns in ``vec_columns``);
    * ``{col}_{bert_model}_cat_vec_sim`` - dot product of the two entries'
      saved embedding vectors, for each model in ``bert_models``
      (only for columns in ``vec_columns``);
    * ``{col}_gesh``, ``{col}_leven``, ``{col}_jaro``, ``{col}_lcs`` -
      difflib ratio, Levenshtein distance, Jaro-Winkler score and LCS length;
      NaN when either value is the string ``'nan'``;
    * for every column except ``phone`` and ``zip``: ``{col}_len_diff``,
      ``{col}_nleven``, ``{col}_nlcsk`` and ``{col}_nlcs`` (length-normalised
      variants of the above).

    Depends on module-level state: ``train``, ``id2index_d``, ``tfidf_d``,
    ``feat_columns``, ``vec_columns``, ``bert_models`` and ``LCS``. Values are
    looked up positionally through ``id2index_d``, so ``train`` must still be
    in the row order it had when ``id2index_d`` and ``tfidf_d`` were built.

    Parameters
    ----------
    df : DataFrame with ``id`` and ``near_id`` columns.

    Returns
    -------
    A new DataFrame with the feature columns appended; ``df`` itself is not modified.
    """
    # Work on a copy so a caller-supplied slice is never written to.
    df = df.copy()

    # Row positions of both sides of every pair; identical for all columns, so built once.
    indexs = [id2index_d[i] for i in df['id']]
    match_indexs = [id2index_d[i] for i in df['near_id']]

    for col in tqdm(feat_columns):
        # Raw values of this column for all training rows, fetched once per column.
        all_values = train[col].values

        if col in vec_columns:
            if col != 'text_features':
                tv_fit = tfidf_d[col]
                # Element-wise product summed per row = dot product of the two sparse rows.
                df[f'{col}_sim'] = np.asarray(
                    tv_fit[indexs].multiply(tv_fit[match_indexs]).sum(axis = 1)
                ).ravel()

            for bert_model in bert_models:
                # NOTE: allow_pickle=True executes pickled content on load - only load files
                # written by this notebook.
                tmp_dict = np.load(f'/content/drive/MyDrive/Kaggle/Foursquare/data/newdata/{col}_{bert_model}_catvec.npy',allow_pickle=True).item()
                # Kept as a per-pair loop: stacking all vectors at once would need
                # two (n_pairs x hidden_size) arrays in memory.
                df[f'{col}_{bert_model}_cat_vec_sim'] = [
                    (tmp_dict[all_values[a]] * tmp_dict[all_values[b]]).sum()
                    for a, b in zip(indexs, match_indexs)
                ]

        col_values = all_values[indexs].astype(str)
        matcol_values = all_values[match_indexs].astype(str)

        geshs = []
        levens = []
        jaros = []
        lcss = []

        for s, match_s in zip(col_values, matcol_values):
            # 'nan' is how missing values appear after the string conversion.
            if s != 'nan' and match_s != 'nan':
                geshs.append(difflib.SequenceMatcher(None, s, match_s).ratio())
                levens.append(Levenshtein.distance(s, match_s))
                jaros.append(Levenshtein.jaro_winkler(s, match_s))
                lcss.append(LCS(str(s), str(match_s)))
            else:
                geshs.append(np.nan)
                levens.append(np.nan)
                jaros.append(np.nan)
                lcss.append(np.nan)

        df[f'{col}_gesh'] = geshs
        df[f'{col}_leven'] = levens
        df[f'{col}_jaro'] = jaros
        df[f'{col}_lcs'] = lcss

        if col not in ['phone', 'zip']:
            # Temporary length columns, removed again once the ratios are computed.
            df[f'{col}_len'] = list(map(len, col_values))
            df[f'match_{col}_len'] = list(map(len, matcol_values))
            df[f'{col}_len_diff'] = np.abs(df[f'{col}_len'] - df[f'match_{col}_len'])
            df[f'{col}_nleven'] = df[f'{col}_leven'] / \
                                    df[[f'{col}_len', f'match_{col}_len']].max(axis = 1)

            df[f'{col}_nlcsk'] = df[f'{col}_lcs'] / df[f'match_{col}_len']
            df[f'{col}_nlcs'] = df[f'{col}_lcs'] / df[f'{col}_len']

            df = df.drop(columns=[f'{col}_len', f'match_{col}_len'])
            gc.collect()
    return df

In [None]:
def get_id2poi(input_df: pd.DataFrame) -> dict:
    """Map every value of the ``id`` column to the ``point_of_interest`` in the same row."""
    return {
        entry_id: poi
        for entry_id, poi in zip(input_df['id'], input_df['point_of_interest'])
    }

def get_poi2ids(input_df: pd.DataFrame) -> dict:
    """Map every ``point_of_interest`` to the set of ``id`` values that share it."""
    ids_by_poi = input_df.groupby('point_of_interest')['id']
    return {poi: set(group_ids) for poi, group_ids in ids_by_poi}

def get_score(input_df: pd.DataFrame, id2poi: dict = None, poi2ids: dict = None):
    """Mean intersection-over-union between predicted and true match sets.

    Parameters
    ----------
    input_df : DataFrame with an ``id`` column and a ``matches`` column holding
        whitespace-separated predicted ids.
    id2poi : optional mapping id -> point_of_interest. When omitted, the
        module-level ``id2poi`` is used, as before.
    poi2ids : optional mapping point_of_interest -> set of ids. When omitted,
        the module-level ``poi2ids`` is used, as before.

    Returns
    -------
    The mean over all rows of ``|targets & preds| / |targets | preds|``, where
    ``targets`` is the id set of the row's point of interest.

    Raises
    ------
    NameError if a mapping is neither passed nor defined at module level.
    """
    # Fall back to the notebook-level lookups so existing get_score(df) calls keep working.
    if id2poi is None:
        if 'id2poi' not in globals():
            raise NameError("name 'id2poi' is not defined")
        id2poi = globals()['id2poi']
    if poi2ids is None:
        if 'poi2ids' not in globals():
            raise NameError("name 'poi2ids' is not defined")
        poi2ids = globals()['poi2ids']

    scores = []
    for id_str, matches in zip(input_df['id'].to_numpy(), input_df['matches'].to_numpy()):
        targets = poi2ids[id2poi[id_str]]
        preds = set(matches.split())
        scores.append(len(targets & preds) / len(targets | preds))
    return np.array(scores).mean()

In [None]:
import gc
# Force a garbage-collection pass; the cell displays the value returned by gc.collect().
gc.collect()

160

In [None]:
import gc
# Index `train` by id so attributes can be looked up by id below.
# The guard keeps the cell re-runnable: a second set_index('id') would raise KeyError.
if train.index.name != 'id':
    train = train.set_index('id')
ids = train_df['id'].tolist()
match_ids = train_df['near_id'].tolist()

# Select the single needed column inside .loc instead of copying every column first.
poi = train.loc[ids, 'point_of_interest'].values
match_poi = train.loc[match_ids, 'point_of_interest'].values

# label = 1 when both entries of a pair belong to the same point of interest.
train_df['label'] = np.array(poi == match_poi, dtype = np.int8)
del poi, match_poi, ids, match_ids
gc.collect()
print('Num of unique id: %s' % train_df['id'].nunique())
print('Num of train data: %s' % len(train_df))
print('Pos rate: %s' % train_df['label'].mean())
print(train.sample(5))

Num of unique id: 1138812
Num of train data: 10895964
Pos rate: 0.068207824475191
                               name  latitude  longitude    address  \
id                                                                    
E_2a6ddaf1c86eff              月出松公園  0.620037   2.435767  都筑区加賀原1-4   
E_55c99c7eb95487             bildel  1.046829   0.409031        nan   
E_03ec82952fa8fa  ruang ii/5 fe uii -0.135443   1.927040        nan   
E_af8e5bfc9500f3  kwyetiiywpaakhmaa  0.286525   1.765464        nan   
E_b3e3eb8ee1f4be  chlidaa plaaephaa  0.239974   1.756095        nan   

                                city          state      zip country  url  \
id                                                                          
E_2a6ddaf1c86eff      heng bang shi            神奈川県  2240055      jp  nan   
E_55c99c7eb95487          tammisaari         nyland    10600      fi  nan   
E_03ec82952fa8fa  sleman, yogyakarta      indonesia    55283      id  nan   
E_af8e5bfc9500f3   mueang phetchabu

In [None]:
## Eval
# Score the candidate pairs as if every true match among them were predicted:
# each id is matched with itself plus all of its label == 1 neighbours.
data = train.reset_index()

id2poi = get_id2poi(data)
poi2ids = get_poi2ids(data)

# Every id always matches itself.
eval_df = pd.DataFrame({'id': data['id'].unique().tolist()})
eval_df['near_id'] = eval_df['id']
print('Unique id: %s' % len(eval_df))

# Add the candidate pairs that are true matches.
eval_df_ = train_df.loc[train_df['label'] == 1, ['id', 'near_id']]
eval_df = pd.concat([eval_df, eval_df_])

# Collapse to one row per id with a space-separated list of distinct matches.
eval_df = eval_df.groupby('id')['near_id'].apply(list).reset_index()
eval_df['matches'] = [' '.join(set(near_ids)) for near_ids in eval_df['near_id']]
print('Unique id: %s' % len(eval_df))

iou_score = get_score(eval_df)
print('IoU score: %s' % iou_score)

Unique id: 1138812
Unique id: 1138812
IoU score: 0.8983671494512845


In [None]:
feat_columns = ['name', 'address', 'city', 'state', 'zip', 'url', 'phone', 'categories', 'country']

# Raw attributes joined onto both sides of every pair. Built once up front:
# the table does not change between folds.
raw_attr_columns = ['id','city','state','country','categories','latitude','longitude','zip', 'url', 'address','phone']
raw_attrs = train.reset_index()[raw_attr_columns]

# Compute and save the pair features one fold at a time.
for i in range(5):
    tmp_ids = train[train['set'] == i].index
    cur_data = train_df[train_df['id'].isin(tmp_ids)]
    cur_data = add_features(cur_data)
    # First join: attributes of `id`. Second join: attributes of `near_id`;
    # pandas then suffixes the overlapping columns with _x (id side) and _y (near_id side).
    cur_data = cur_data.merge(raw_attrs, on='id', how='left')
    cur_data = cur_data.merge(raw_attrs, left_on='near_id', right_on='id', how='left')
    # The second join duplicates the key: drop the near_id-side copy and restore the name 'id'.
    cur_data = cur_data.drop(columns=['id_y']).rename(columns={'id_x':'id'})
    cur_data.to_csv('/content/drive/MyDrive/Kaggle/Foursquare/data/newdata/train_data_v8_%s.csv' % i, index = False)
    del cur_data
    gc.collect()

100%|██████████| 9/9 [16:00<00:00, 106.74s/it]
100%|██████████| 9/9 [15:28<00:00, 103.13s/it]
100%|██████████| 9/9 [15:18<00:00, 102.02s/it]
100%|██████████| 9/9 [15:45<00:00, 105.08s/it]
100%|██████████| 9/9 [15:46<00:00, 105.14s/it]
