In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import BallTree
from sklearn.neighbors import KNeighborsRegressor
from tqdm.notebook import tqdm
from sklearn.model_selection import GroupKFold, KFold, StratifiedKFold
import pickle
import lightgbm as lgbm
import Levenshtein
import difflib
import sklearn
import joblib
from sklearn.neighbors import NearestNeighbors
import warnings
warnings.filterwarnings('ignore')

In [2]:
%%capture
# install reverse-geocode
!mkdir -p /tmp/pip/cache/
!cp ../input/reverse-geocode/reverse_geocode.xyz /tmp/pip/cache/reverse_geocode.tar.gz
!pip install /tmp/pip/cache/reverse_geocode.tar.gz

In [3]:
from sklearnex import patch_sklearn
patch_sklearn()
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import os
import gc
import random
from sklearn.model_selection import GroupKFold
import warnings
import pickle
from unidecode import unidecode
import reverse_geocode
import string
import Levenshtein
import difflib
from sklearn.neighbors import NearestNeighbors
from transformers import AutoTokenizer
import time
import joblib
from math import radians

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [4]:
def reduce_mem_usage(df):
    """Downcast numeric columns of ``df`` in place to the smallest fitting dtype.

    Only plain NumPy signed-integer and floating-point columns are touched.
    Every other dtype (object/str, bool, unsigned int, datetime, categorical,
    pandas extension dtypes) is left exactly as it is: pushing those through
    the float branch either inflated them (bool/uint8 -> float16) or raised.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.

    Notes
    -----
    * Bounds are compared strictly, so a column whose extreme equals the
      dtype's limit is given the next wider dtype.
    * Float columns that fit the float16 range are stored as float16, which
      keeps only about three significant decimal digits.
    * Memory usage before/after is printed to stdout.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    for col in df.columns:
        col_type = df[col].dtype

        # Skip anything that is not a NumPy signed int ('i') or float ('f').
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'if':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'i':
            # First integer width that strictly contains [c_min, c_max].
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Also reached for all-NaN columns (every comparison is False).
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [5]:
!unzip ../input/foursquare-new-model-lgb/pykakasi_deps.dontopenthiskaggle/pykakasi_deps.dontopenthiskaggle

Archive:  ../input/foursquare-new-model-lgb/pykakasi_deps.dontopenthiskaggle/pykakasi_deps.dontopenthiskaggle
   creating: pykakasi_deps/
  inflating: pykakasi_deps/offline_deprecated.tar.bz2  
  inflating: pykakasi_deps/offline_jaconv.tar.bz2  
  inflating: pykakasi_deps/offline_pykakasi.tar.bz2  


In [6]:
!conda install ./pykakasi_deps/offline_pykakasi.tar.bz2
!conda install ./pykakasi_deps/offline_jaconv.tar.bz2
!conda install ./pykakasi_deps/offline_deprecated.tar.bz2


Downloading and Extracting Packages
######################################################################## | 100% 
Preparing transaction: - done
Verifying transaction: | done
Executing transaction: - done

Downloading and Extracting Packages
######################################################################## | 100% 
Preparing transaction: - done
Verifying transaction: | done
Executing transaction: - \ | / - \ | / - \ | / done

Downloading and Extracting Packages
######################################################################## | 100% 
Preparing transaction: - done
Verifying transaction: | done
Executing transaction: - \ | / - \ | / - done


In [7]:
# Columns for which add_features() computes pairwise string-similarity features.
feat_columns = ['name', 'address', 'city', 
            'state', 'zip', 'url', 
           'phone', 'categories', 'country']
# Subset of feat_columns that additionally gets TF-IDF and DistilBERT-embedding
# similarity features (a TF-IDF matrix and a saved embedding dict per column).
vec_columns = ['name', 'categories','address','url','phone','country','state']

In [8]:
# Load the test set that the rest of the notebook scores.
test = pd.read_csv('../input/foursquare-location-matching/test.csv')
# A test file of at most 5 rows gets a smaller configuration: NUM_SPLIT is the
# number of id chunks used by the prediction loop and k_neighbours is the
# neighbour count passed to add_neighbor_features().
if len(test) <= 5:
    NUM_SPLIT = 5
    k_neighbours = 5
else:
    NUM_SPLIT = 10
    k_neighbours = 25
    
from sklearn.feature_extraction.text import TfidfVectorizer

In [9]:
# test['number'] = test.address.str.extract('(\d+)')
def clean_country(cols):
    """Return the row's country code, reverse-geocoding it when it is missing.

    Parameters
    ----------
    cols : sequence
        ``(latitude, longitude, country)`` in that order, e.g. one row of
        ``test[['latitude', 'longitude', 'country']]`` or a plain list.

    Returns
    -------
    The original ``country`` when it is not NaN; otherwise the
    ``'country_code'`` of the first result of ``reverse_geocode.search``
    for the row's coordinates.
    """
    # Unpack by position. Integer indexing such as cols[0] on a Series with
    # string labels is deprecated in recent pandas.
    lat, long, country = tuple(cols)[:3]
    if country != country:  # NaN is the only value that is not equal to itself
        coordinates = ((lat, long),)
        result = reverse_geocode.search(coordinates)
        return result[0]['country_code']
    return country
# Fill missing country codes row by row from the coordinates (see clean_country).
test['country'] = test[['latitude','longitude','country']].apply(clean_country, axis=1)

In [10]:
# Snapshot the raw venue names for country codes 'CN' and 'JP' before
# standard_data() runs; they are written back afterwards so that the
# unidecode transliteration applied there does not stick for these rows.
china = test.loc[test['country'] == 'CN']['name'].values
japan = test.loc[test['country'] == 'JP']['name'].values
def standard_data(df):
    """Normalise the text columns of ``df`` in place and return it.

    For each of name, categories, country, zip, phone, url and city:
    cast to ``str``, lower-case, transliterate with ``unidecode``, then
    (zip/phone/url only) strip punctuation and spaces, then (url only) strip
    scheme and ``www`` fragments. Finally the literal string ``'nan'``
    (produced by casting missing values to ``str``) is turned back into NaN.

    Every ``str.replace`` call states ``regex=`` explicitly. The pandas
    default flipped from ``True`` to ``False`` in 2.0, which would silently
    turn the punctuation-stripping step into a no-op. The values chosen
    reproduce the pre-2.0 default behaviour.

    NOTE(review): two quirks are kept on purpose so that features stay
    identical to what this code produced before; confirm against the training
    pipeline before changing them:
      * punctuation is removed before the scheme, so ``https://`` has already
        become ``https`` and removing ``http`` leaves a stray leading ``s``;
      * ``'www.'`` is applied as a regular expression, so the ``.`` also
        consumes the first character that follows ``www``.
    """
    columns = ['name', 'categories', 'country', 'zip', 'phone', 'url', 'city']
    punctuation_class = '[{}]'.format(string.punctuation)
    # All of these are plain substrings; several can no longer match once the
    # punctuation has been stripped but are kept for parity.
    url_literals = ('http://', 'https://', 'http:', 'https:', 'http', 'https')

    for c in columns:
        df[c] = df[c].astype(str).str.lower()
        df[c] = df[c].apply(lambda x: unidecode(x)) # will remove for china and japan later
        df[c] = df[c].astype(str).str.lower()
        if c in ['zip', 'phone', 'url']:
            df[c] = df[c].str.replace(punctuation_class, '', regex=True)
            df[c] = df[c].str.replace(' ', '', regex=False)
        if c == 'url':
            for literal in url_literals:
                df[c] = df[c].str.replace(literal, '', regex=False)
            df[c] = df[c].str.replace('www.', '', regex=True)
            df[c] = df[c].str.replace('www', '', regex=False)
        df[c] = df[c].replace('nan', np.nan)
    return df
test = standard_data(test)
# Put back the raw names captured before normalisation. Country codes are
# lower-case at this point because standard_data() lower-cased them.
test.loc[test['country'] == 'cn', 'name'] = china
test.loc[test['country'] == 'jp', 'name'] = japan
del china, japan
gc.collect()
# del china, japan
# gc.collect()
# test['latitude_round'] = test['latitude'].round(1)
# test['longitude_round'] = test['longitude'].round(1)
# Convert coordinates from degrees to radians; add_neighbor_features() later
# runs a haversine-metric neighbour search on these two columns.
test['latitude'] = test['latitude'].apply(lambda x: radians(x))
test['longitude'] = test['longitude'].apply(lambda x: radians(x))

In [11]:
import pykakasi
def convert_japanese_alphabet(df: pd.DataFrame):
    """Romanise name/address/city/state for rows whose country is ``"JP"``.

    Hiragana, Katakana and Kanji are converted to the Latin alphabet with
    pykakasi. Conversion is best effort per cell: a value the converter
    rejects (e.g. a missing value) is left unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``country`` plus the four text columns. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.

    NOTE(review): the mask compares against upper-case ``"JP"``, while
    standard_data() lower-cases country codes. The only call visible in this
    notebook is commented out; confirm the intended call order before
    re-enabling it.
    """
    kakasi = pykakasi.kakasi()
    kakasi.setMode('H', 'a')  # Convert Hiragana into alphabet
    kakasi.setMode('K', 'a')  # Convert Katakana into alphabet
    kakasi.setMode('J', 'a')  # Convert Kanji into alphabet
    conversion = kakasi.getConverter()

    def convert(row):
        for column in ["name", "address", "city", "state"]:
            try:
                row[column] = conversion.do(row[column])
            except Exception:
                # Best effort: keep the original cell. `Exception` rather than
                # a bare except so Ctrl-C / SystemExit still propagate.
                pass
        return row

    is_japan = df["country"] == "JP"
    if is_japan.any():
        df[is_japan] = df[is_japan].apply(convert, axis=1)
    return df

# test = convert_japanese_alphabet(test)

In [12]:
from transformers import DistilBertModel, DistilBertTokenizer,AutoTokenizer, AutoModel
import torch.nn.functional as F
import torch.nn as nn
import torch
MAX_LEN = 32

class Cat2VecModel(nn.Module):
    """DistilBERT wrapper that turns token ids into one L2-normalised vector per input."""

    def __init__(self):
        super().__init__()
        self.distill_bert = DistilBertModel.from_pretrained("../input/distilbertbaseuncased/")

    def forward(self, ids, mask):
        """Return the normalised mean of the masked hidden states, first position dropped."""
        hidden = self.distill_bert(ids, mask)[0]
        # Zero out padded positions, skip position 0, then average over the
        # remaining sequence axis.
        masked = hidden[:, 1:, :] * mask[:, 1:, None]
        pooled = masked.mean(axis=1)
        return F.normalize(pooled)
    
# Instantiate the module-level encoder and move it to the GPU.
# NOTE(review): get_vec_col() below builds and uses its own local
# `cat2vec_model`, so nothing visible in this notebook reads this instance;
# it appears to only occupy GPU memory. Confirm before removing.
cat2vec_model = Cat2VecModel()
cat2vec_model = cat2vec_model.cuda()

from torch.utils.data import DataLoader, Dataset

import pickle
def get_vec_col(col,bert_model='distilbert-base-uncased'):
    """Embed every distinct value of ``test[col]`` and save the lookup to disk.

    A DistilBERT encoder is loaded from ``../input/distilbertbaseuncased/``,
    each unique value of the column (missing values become the string
    ``"nan"``) is tokenised and encoded on the GPU, and the resulting
    ``{value: vector}`` dict is written with ``np.save`` to
    ``./{col}_{bert_model}_catvec.npy``.

    Parameters
    ----------
    col : str
        Column of the global ``test`` frame to embed.
    bert_model : str
        Only used to build the output file name; the weights always come from
        the fixed local directory.

    Returns
    -------
    None
    """
    import sys

    # Token budget: longer for the free-text column, 32 otherwise.
    MAX_LEN = 150 if col == 'text_features' else 32
    BS = 256
    NW = 2

    class Cat2VecModel(nn.Module):
        """Local encoder: masked mean of hidden states (position 0 dropped), L2-normalised."""

        def __init__(self):
            super(Cat2VecModel, self).__init__()
            self.distill_bert = AutoModel.from_pretrained('../input/distilbertbaseuncased/')

        def forward(self, ids, mask):
            hidden = self.distill_bert(ids, mask)[0]
            pooled = (hidden[:, 1:, :] * mask[:, 1:, None]).mean(axis=1)
            return F.normalize(pooled)

    cat2vec_model = Cat2VecModel().cuda()

    class InferenceDataset(Dataset):
        """Yields ``(input_ids, attention_mask)`` tensors for each row of ``df[col]``."""

        def __init__(self, df, max_len):
            super().__init__()
            self.df = df.reset_index(drop=True)
            self.max_len = max_len
            self.tokenizer = AutoTokenizer.from_pretrained('../input/distilbertbaseuncased/',do_lower_case=True)

        def __len__(self):
            return self.df.shape[0]

        def __getitem__(self, index):
            text = self.df.iloc[index][col]
            # The enclosing MAX_LEN is what bounds the sequence, as before.
            encoded = self.tokenizer.encode_plus(
                text,
                None,
                add_special_tokens=True,
                max_length=MAX_LEN,
                padding="max_length",
                return_token_type_ids=True,
                truncation=True
            )
            return (
                torch.LongTensor(encoded['input_ids']),
                torch.LongTensor(encoded['attention_mask']),
            )

    cat_df = test[[col]].drop_duplicates()
    cat_df[col] = cat_df[col].fillna("nan")
    cat_ds = InferenceDataset(cat_df, max_len=MAX_LEN)

    def inference(ds):
        """Encode the whole dataset in order and return one stacked array."""
        loader = DataLoader(ds, batch_size=BS, shuffle=False, num_workers=NW,
                            pin_memory=False, drop_last=False)
        batches = []
        with torch.no_grad():
            for ids, masks in tqdm(loader, file=sys.stdout):
                encoded = cat2vec_model(ids.cuda(), masks.cuda())
                batches.append(encoded.detach().cpu().numpy())
        return np.concatenate(batches)

    V = inference(cat_ds)

    # Row i of V belongs to the i-th distinct value (the loader is not shuffled).
    tmp_dict = dict(zip(cat_df[col].values, V))
    np.save(f'./{col}_{bert_model}_catvec.npy',tmp_dict)

Some weights of the model checkpoint at ../input/distilbertbaseuncased/ were not used when initializing DistilBertModel: ['vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [13]:
def add_neighbor_features(df, Neighbors = 10):
    """Build candidate (id, near_id) pairs from spatial nearest neighbours.

    Two searches are run on ``latitude``/``longitude``:

    1. per ``country`` group, with the haversine metric, producing
       ``kdist_country`` / ``kneighbors_country``;
    2. over the whole frame, producing ``kdist`` / ``kneighbors``.
       No metric is passed here, so the estimator's default is used,
       not haversine; kept as is.

    Neighbour rank 0 is skipped in both searches (it is taken to be the query
    point itself). The two pair sets are outer-merged on ``['id', 'near_id']``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``id``, ``country``, ``latitude`` and ``longitude`` columns.
    Neighbors : int
        Requested number of neighbours (including the point itself). It is
        clamped to the number of available rows, both per country and
        globally, so small inputs no longer make ``kneighbors`` raise.

    Returns
    -------
    pandas.DataFrame
        Columns ``id, near_id, kdist, kneighbors, kdist_country,
        kneighbors_country``. It is empty (rather than an error from
        ``pd.concat``) when no pair can be formed.
    """
    coord_cols = ['latitude', 'longitude']

    def _stack(frames, dist_col, rank_col):
        """Concatenate the per-rank frames, or give an empty frame with the same columns."""
        if frames:
            return pd.concat(frames)
        return pd.DataFrame({
            'id': pd.Series(dtype = object),
            'near_id': pd.Series(dtype = object),
            dist_col: pd.Series(dtype = float),
            rank_col: pd.Series(dtype = float),
        })

    print('Start knn grouped by country')
    train_df_country = []
    for country, country_df in tqdm(df.groupby('country')):
        country_df = country_df.reset_index(drop = True)

        neighbors = min(len(country_df), Neighbors)
        if neighbors < 2:
            # A lone venue has no neighbour besides itself.
            continue
        knn = KNeighborsRegressor(n_neighbors = neighbors,
                                    metric = 'haversine',
                                    n_jobs = -1)
        # The regressor is only used for its neighbour search; the target is a dummy.
        knn.fit(country_df[coord_cols], country_df.index)
        dists, nears = knn.kneighbors(country_df[coord_cols],
                                        return_distance = True)

        ids = country_df['id'].values
        for k in range(1, neighbors):
            # Build a fresh frame instead of assigning onto a slice.
            train_df_country.append(pd.DataFrame({
                'id': ids,
                'near_id': ids[nears[:, k]],
                'kdist_country': dists[:, k],
                'kneighbors_country': k,
            }))
    train_df_country = _stack(train_df_country, 'kdist_country', 'kneighbors_country')

    print('Start knn')
    train_df = []
    neighbors = min(len(df), Neighbors)
    if neighbors >= 2:
        knn = NearestNeighbors(n_neighbors = neighbors)
        knn.fit(df[coord_cols], df.index)
        dists, nears = knn.kneighbors(df[coord_cols])

        ids = df['id'].values
        for k in range(1, neighbors):
            train_df.append(pd.DataFrame({
                'id': ids,
                'near_id': ids[nears[:, k]],
                'kdist': dists[:, k],
                'kneighbors': k,
            }))
    train_df = _stack(train_df, 'kdist', 'kneighbors')

    train_df = train_df.merge(train_df_country,
                                 on = ['id', 'near_id'],
                                 how = 'outer')
    del train_df_country

    return train_df

In [14]:
%load_ext Cython

In [15]:
%%cython
def LCS(str S, str T):
    """Return the length of the longest common subsequence of S and T.

    Standard dynamic programme over a (len(S)+1) x (len(T)+1) table whose
    first row and column stay at 0.
    """
    cdef int i, j
    cdef int n = len(S)
    cdef int m = len(T)
    cdef list table = [[0] * (len(T) + 1) for _ in range(len(S) + 1)]
    for i in range(n):
        for j in range(m):
            # Extend the diagonal on a character match, otherwise carry the
            # best of the neighbouring cells.
            table[i + 1][j + 1] = max(
                table[i][j] + (S[i] == T[j]),
                table[i + 1][j],
                table[i][j + 1],
                table[i + 1][j + 1],
            )
    return table[n][m]

In [16]:
import Levenshtein
import difflib
import multiprocessing

def add_features(df, bert_model='distilbert-base-uncased'):
    """Add pairwise similarity features for every column in ``feat_columns``.

    For each candidate pair (``id``, ``near_id``) and each column this adds:

    * ``{col}_sim``: sum of the element-wise product of the two rows of the
      column's matrix in ``tfidf_d`` (``vec_columns`` only);
    * ``{col}_{bert_model}_cat_vec_sim``: dot product of the two saved
      embedding vectors (``vec_columns`` only);
    * ``{col}_gesh`` / ``_leven`` / ``_jaro`` / ``_lcs``: difflib ratio,
      Levenshtein distance, Jaro-Winkler and LCS length, NaN when either
      side is the string ``'nan'``;
    * ``{col}_len_diff`` / ``_nleven`` / ``_nlcsk`` / ``_nlcs``:
      length-normalised variants (not for phone and zip).

    Parameters
    ----------
    df : pandas.DataFrame
        Candidate pairs with ``id`` and ``near_id`` columns.
    bert_model : str
        Name used both in the embedding file name
        ``./{col}_{bert_model}_catvec.npy`` and in the feature column name.
        It used to be read from a global that only existed because a loop in
        a later cell leaked its variable; the default reproduces that value.

    Returns
    -------
    pandas.DataFrame
        ``df`` with the feature columns added.

    Relies on the notebook globals ``test`` (indexed by id), ``tfidf_d``,
    ``id2index_d``, ``feat_columns`` and ``vec_columns``.
    """
    # Row positions of both members of every pair. They depend only on the
    # ids, so they are built once rather than twice per vectorised column.
    indexs = [id2index_d[i] for i in df['id']]
    match_indexs = [id2index_d[i] for i in df['near_id']]

    for col in tqdm(feat_columns):
        if col in vec_columns:
            tv_fit = tfidf_d[col]
            # np.asarray handles both np.matrix and ndarray results of .sum().
            df[f'{col}_sim'] = np.asarray(
                tv_fit[indexs].multiply(tv_fit[match_indexs]).sum(axis = 1)
            ).ravel()

            tmp_dict = np.load(f'./{col}_{bert_model}_catvec.npy',allow_pickle=True).item()
            raw_values = test[col].values
            df[f'{col}_{bert_model}_cat_vec_sim'] = [
                (tmp_dict[raw_values[a]] * tmp_dict[raw_values[b]]).sum()
                for a, b in zip(indexs, match_indexs)
            ]

        col_values = test.loc[df['id']][col].values.astype(str)
        matcol_values = test.loc[df['near_id']][col].values.astype(str)

        geshs = []
        levens = []
        jaros = []
        lcss = []
        for s, match_s in zip(col_values, matcol_values):
            if s != 'nan' and match_s != 'nan':
                geshs.append(difflib.SequenceMatcher(None, s, match_s).ratio())
                levens.append(Levenshtein.distance(s, match_s))
                jaros.append(Levenshtein.jaro_winkler(s, match_s))
                lcss.append(LCS(str(s), str(match_s)))
            else:
                geshs.append(np.nan)
                levens.append(np.nan)
                jaros.append(np.nan)
                lcss.append(np.nan)

        df[f'{col}_gesh'] = geshs
        df[f'{col}_leven'] = levens
        df[f'{col}_jaro'] = jaros
        df[f'{col}_lcs'] = lcss

        if col not in ['phone', 'zip']:
            df[f'{col}_len'] = list(map(len, col_values))
            df[f'match_{col}_len'] = list(map(len, matcol_values))
            df[f'{col}_len_diff'] = np.abs(df[f'{col}_len'] - df[f'match_{col}_len'])
            df[f'{col}_nleven'] = df[f'{col}_leven'] / \
                                    df[[f'{col}_len', f'match_{col}_len']].max(axis = 1)

            df[f'{col}_nlcsk'] = df[f'{col}_lcs'] / df[f'match_{col}_len']
            df[f'{col}_nlcs'] = df[f'{col}_lcs'] / df[f'{col}_len']

            # The raw lengths were only needed for the normalised features.
            df = df.drop(f'{col}_len', axis = 1)
            df = df.drop(f'match_{col}_len', axis = 1)
            gc.collect()

    return df

In [17]:
def recall_simple(df):
    """Recall candidate pairs that share exact values in several columns.

    For every row, ids sharing the same (case-folded) value are collected per
    entry of the global ``rec_columns``; an id becomes a candidate when it is
    hit more than ``threshold`` (2) times. A column listed twice in
    ``rec_columns`` therefore counts twice, as before.

    Fixes relative to the earlier version:

    * ``Counter`` is imported here (its import elsewhere was commented out);
    * the lookup keys are case-folded in the same way as the looked-up
      values, so a value containing upper-case letters no longer raises
      ``KeyError``;
    * no division by zero when no pair is recalled.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``id`` and every column in ``rec_columns``.

    Returns
    -------
    pandas.DataFrame
        De-duplicated ``id`` / ``near_id`` pairs (self-pairs included).
    """
    from collections import Counter

    threshold = 2

    def _fold(value):
        """Lower-case strings; leave anything else (e.g. NaN) untouched."""
        return value.lower() if isinstance(value, str) else value

    val2id_d = {}
    for col in rec_columns:
        # groupby drops NaN keys, so missing values never form a group.
        val2id_d[col] = df.groupby(df[col].map(_fold))['id'].apply(set).to_dict()

    cus_ids = []
    match_ids = []
    for vals in tqdm(df[rec_columns + ['id']].fillna('null').values):
        cus_id = vals[-1]

        rec_match_count = Counter()
        for col, val in zip(rec_columns, vals):
            if val != 'null':
                rec_match_count.update(val2id_d[col].get(_fold(val), ()))

        match_id = [k for k, v in rec_match_count.items() if v > threshold]

        cus_ids += [cus_id] * len(match_id)
        match_ids += match_id

    train_df = pd.DataFrame()
    train_df['id'] = cus_ids
    train_df['near_id'] = match_ids
    train_df = train_df.drop_duplicates()
    del cus_ids, match_ids

    num_data = len(train_df)
    num_ids = train_df['id'].nunique()
    num_data_per_id = num_data / num_ids if num_ids else 0.0
    print('Num of data: %s' % num_data)
    print('Num of data per id: %s' % num_data_per_id)

    return train_df

In [18]:
def reduce_mem_usage(df):
    """Downcast numeric columns of ``df`` in place to the smallest fitting dtype.

    Only plain NumPy signed-integer and floating-point columns are touched.
    Every other dtype (object/str, bool, unsigned int, datetime, categorical,
    pandas extension dtypes) is left exactly as it is: pushing those through
    the float branch either inflated them (bool/uint8 -> float16) or raised.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.

    Notes
    -----
    * Bounds are compared strictly, so a column whose extreme equals the
      dtype's limit is given the next wider dtype.
    * Float columns that fit the float16 range are stored as float16, which
      keeps only about three significant decimal digits.
    * Memory usage before/after is printed to stdout.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    for col in df.columns:
        col_type = df[col].dtype

        # Skip anything that is not a NumPy signed int ('i') or float ('f').
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'if':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'i':
            # First integer width that strictly contains [c_min, c_max].
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Also reached for all-NaN columns (every comparison is False).
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df



In [19]:
def add_tfidf_feature_test(train,model_path):
    """Add TF-IDF similarity columns for names and categories using saved vectorizers.

    Loads ``name_tf.pkl`` and ``cat_tf.pkl`` from ``model_path``, transforms
    each pair's own and neighbour text (missing values replaced by a
    placeholder) and stores the row-wise sum of the element-wise product in
    ``name_tfidf`` and ``cat_tfidf``.

    Returns the mutated ``train`` frame and the two loaded vectorizers.
    """
    def _pair_score(vectorizer, own_col, near_col, placeholder):
        own = vectorizer.transform(train[own_col].fillna(placeholder))
        near = vectorizer.transform(train[near_col].fillna(placeholder))
        return np.array(own.multiply(near).sum(axis=1)).reshape(-1,)

    name_tf = joblib.load(model_path + 'name_tf.pkl')
    name_score = _pair_score(name_tf, "name", "near_name", "noname")

    cat_tf = joblib.load(model_path + 'cat_tf.pkl')
    cat_score = _pair_score(cat_tf, 'categories', "near_categories", "nocategory")

    # Columns are written only after both scores were computed.
    train['name_tfidf'] = name_score
    train['cat_tfidf'] = cat_score
    return train , name_tf , cat_tf 

def submit_lgb(test,model_path,features,n_splits=5):
    """Average the positive-class probability of the per-fold LightGBM models.

    Parameters
    ----------
    test : pandas.DataFrame
        Candidate pairs holding ``id``, ``near_id`` and every column in
        ``features``. A ``pred`` column is added to it in place.
    model_path : str
        Directory prefix containing ``lgbm_fold{fold}_v7.pkl`` files.
    features : list of str
        Feature columns passed to each model, in this order.
    n_splits : int
        Number of fold models to load; must be at least 1.

    Returns
    -------
    pandas.DataFrame
        ``test[['id', 'near_id', 'pred']]``.

    Raises
    ------
    ValueError
        If ``n_splits`` is smaller than 1 (there would be nothing to average).
    """
    if n_splits < 1:
        raise ValueError('n_splits must be at least 1, got %r' % (n_splits,))

    preds = []
    for fold in tqdm(range(n_splits)):
        print(f'==fold{fold}==')
        # NOTE: joblib.load unpickles arbitrary objects; only point
        # model_path at trusted artefacts.
        model = joblib.load(model_path + f'lgbm_fold{fold}_v7.pkl')
        pred = model.predict_proba(test[features])[:,1]
        preds.append(pred)

    # Average once after all folds; recomputing it inside the loop only
    # produced throw-away intermediate columns.
    test['pred'] = np.array(preds).mean(axis=0)
    return test[['id','near_id','pred']]

def process_features(df):
    """Cast model inputs to their expected dtypes and return the feature order.

    Numeric features are cast to ``float16`` and the eight city/state/country/
    categories columns (both sides of the pair) to ``category``. ``df`` is
    modified in place.

    Returns
    -------
    (df, features)
        The same frame and the ordered list of feature names
        (numeric features first, then categorical ones).
    """
    # The order below is the order handed to the models; keep it unchanged.
    neighbor_feats = ['kdist', 'kneighbors', 'kdist_country', 'kneighbors_country']
    name_feats = [
        'name_sim', 'name_distilbert-base-uncased_cat_vec_sim', 'name_gesh',
        'name_leven', 'name_jaro', 'name_lcs', 'name_len_diff', 'name_nleven',
        'name_nlcsk', 'name_nlcs',
    ]
    address_feats = [
        'address_sim', 'address_distilbert-base-uncased_cat_vec_sim',
        'address_gesh', 'address_leven', 'address_jaro', 'address_lcs',
        'address_len_diff', 'address_nleven', 'address_nlcsk', 'address_nlcs',
    ]
    city_feats = [
        'city_gesh', 'city_leven', 'city_jaro', 'city_lcs', 'city_len_diff',
        'city_nleven', 'city_nlcsk', 'city_nlcs',
    ]
    state_feats = [
        'state_sim', 'state_distilbert-base-uncased_cat_vec_sim', 'state_gesh',
        'state_leven', 'state_jaro', 'state_lcs', 'state_len_diff',
        'state_nleven', 'state_nlcsk', 'state_nlcs',
    ]
    zip_feats = ['zip_gesh', 'zip_leven', 'zip_jaro', 'zip_lcs']
    url_feats = [
        'url_sim', 'url_distilbert-base-uncased_cat_vec_sim', 'url_gesh',
        'url_leven', 'url_jaro', 'url_lcs', 'url_len_diff', 'url_nleven',
        'url_nlcsk', 'url_nlcs',
    ]
    phone_feats = [
        'phone_sim', 'phone_distilbert-base-uncased_cat_vec_sim',
        'phone_gesh', 'phone_leven', 'phone_jaro', 'phone_lcs',
    ]
    categories_feats = [
        'categories_sim', 'categories_distilbert-base-uncased_cat_vec_sim',
        'categories_gesh', 'categories_leven', 'categories_jaro',
        'categories_lcs', 'categories_len_diff', 'categories_nleven',
        'categories_nlcsk', 'categories_nlcs',
    ]
    country_feats = [
        'country_sim', 'country_distilbert-base-uncased_cat_vec_sim',
        'country_gesh', 'country_leven', 'country_jaro', 'country_lcs',
        'country_len_diff', 'country_nleven', 'country_nlcsk', 'country_nlcs',
    ]
    geo_feats = [
        'latitude_y', 'longitude_y', 'latdiff', 'londiff', 'manhattan',
        'euclidean', 'haversine', 'kdist_diff', 'kneighbors_mean',
    ]
    aggregate_feats = [
        'sim_sum', 'gesh_sum', 'leven_sum', 'jaro_sum', 'lcs_sum',
        'sim_std', 'gesh_std', 'leven_std', 'jaro_std', 'lcs_std',
    ]
    presence_feats = [
        'name_1', 'categories_1', 'address_1', 'state_1', 'url_1', 'country_1',
        'name_2', 'categories_2', 'address_2', 'state_2', 'url_2', 'country_2',
        'info_power_1', 'info_power_2', 'info_diff',
    ]

    num_features = (
        neighbor_feats + name_feats + address_feats + city_feats + state_feats
        + zip_feats + url_feats + phone_feats + categories_feats + country_feats
        + geo_feats + aggregate_feats + presence_feats
    )
    cat_features = [
        'city_x', 'state_x', 'country_x', 'categories_x',
        'city_y', 'state_y', 'country_y', 'categories_y',
    ]

    df[num_features] = df[num_features].astype('float16')
    df[cat_features] = df[cat_features].astype('category')

    return df, num_features + cat_features
    
def get_submit_all_series(test,model_path,n_splits):
    """Prepare dtypes with process_features() and score the pairs with submit_lgb().

    Returns the ``id`` / ``near_id`` / ``pred`` frame produced by submit_lgb().
    """
    prepared, feature_names = process_features(test)
    return submit_lgb(prepared, model_path, feature_names, n_splits)

In [20]:
def add_extra_features(train):
  """Add geometric, aggregate and data-presence features to the pair frame.

  Expects the coordinate columns ``latitude_x/longitude_x`` (pair's own venue)
  and ``latitude_y/longitude_y`` (neighbour), the kNN columns, and the
  similarity columns produced by add_features(). Reads the global ``tdata``
  table for the presence flags. Statement order matters: the aggregate
  columns are selected by substring, so columns added earlier in this
  function can be picked up by later selections.

  Returns the extended frame (a new object after the merges).
  """
  train['latdiff'] = train['latitude_x'] - train['latitude_y']
  train['londiff'] = train['longitude_x'] - train['longitude_y']
  train['manhattan'] = manhattan(train['latitude_x'], train['longitude_x'], train['latitude_y'], train['longitude_y'])
  # NOTE(review): 'euclidean' only uses the _x coordinates, i.e. it is the norm
  # of the venue's own (lat, lon) and not a distance between the two venues.
  # Left as is; confirm what the models were trained on.
  train['euclidean'] = (train['latitude_x'] ** 2 + train['longitude_x'] ** 2) ** 0.5
  # NOTE(review): arguments are passed as (lat_x, lon_x, lat_y, lon_y) while
  # vectorized_haversine is declared as (lats1, lats2, longs1, longs2).
  # Left as is; confirm against the training pipeline.
  train['haversine'] = vectorized_haversine(train['latitude_x'], train['longitude_x'], train['latitude_y'], train['longitude_y'])
  train['kdist_diff'] = (train['kdist'] - train['kdist_country']) / train['kdist_country']
  train['kneighbors_mean'] = train[['kneighbors', 'kneighbors_country']].mean(axis = 1)
  # Row-wise sums over every column whose name contains the given substring.
  train['sim_sum'] = train[[col for col in train.columns if 'sim' in col]].sum(axis=1)
  train['gesh_sum'] = train[[col for col in train.columns if 'gesh' in col]].sum(axis=1)
  train['leven_sum'] = train[[col for col in train.columns if '_leven' in col]].sum(axis=1)
  train['jaro_sum'] = train[[col for col in train.columns if 'jaro' in col]].sum(axis=1)
  train['lcs_sum'] = train[[col for col in train.columns if '_lcs' in col]].sum(axis=1)

  # Row-wise standard deviations. For 'sim', 'gesh' and 'jaro' the substring
  # also matches the *_sum column created just above, so that sum is part of
  # the std; '_leven' and '_lcs' do not match 'leven_sum' / 'lcs_sum'.
  train['sim_std'] = train[[col for col in train.columns if 'sim' in col]].std(axis=1)
  train['gesh_std'] = train[[col for col in train.columns if 'gesh' in col]].std(axis=1)
  train['leven_std'] = train[[col for col in train.columns if '_leven' in col]].std(axis=1)
  train['jaro_std'] = train[[col for col in train.columns if 'jaro' in col]].std(axis=1)
  train['lcs_std'] = train[[col for col in train.columns if '_lcs' in col]].std(axis=1)

  # Attach the per-venue flags from tdata for both members of the pair; the
  # second merge suffixes the overlapping flag columns with _1 (own) / _2 (neighbour).
  train = train.merge(tdata, on='id', how='left')
  train = train.merge(tdata, left_on='near_id', right_on='id', how='left', suffixes=['_1','_2'])
  train = train.drop('id_2', axis=1).rename(columns={'id_1':'id'})
  # Count, per side, the flag columns whose value is below 1.
  train['info_power_1'] = train[[col for col in train.columns if '_1' in col]].lt(1).sum(axis=1)
  train['info_power_2'] = train[[col for col in train.columns if '_2' in col]].lt(1).sum(axis=1)
  train['info_diff'] = train['info_power_1'] - train['info_power_2']
    
  return train

In [21]:
# Candidate pairs from the spatial neighbour searches.
test_1 = add_neighbor_features(test,Neighbors = k_neighbours)
# NOTE(review): this downcasts the numeric columns of `test` after the
# neighbour search; latitude/longitude in radians fall inside the float16
# range checked by reduce_mem_usage, so the coordinates merged into the pair
# frame later are presumably float16. Confirm this matches training.
test = reduce_mem_usage(test)

# NOTE(review): 'address' is listed twice; rec_columns is only read by
# recall_simple(), whose call is commented out further down.
rec_columns = ['name', 'address', 'categories', 'address', 'phone']
columns = ['id', 'name', 'address', 'city', 'state','zip', 'country', 'url', 'phone', 'categories']
# Cast every text column to lower-case str. astype(str) turns missing values
# into the string 'nan', which add_features() checks for explicitly.
for c in columns:
    if c != 'id':
        test[c] = test[c].astype(str).str.lower()

# One similarity matrix per vectorised column, keyed by column name.
tfidf_d = {}
for col in vec_columns:
    if col == 'categories':
        tfidf = TfidfVectorizer(use_idf=False)
    else:
        tfidf = TfidfVectorizer(ngram_range=(3, 3), analyzer="char_wb", use_idf=False)
        # NOTE(review): fit_transform sits inside the else branch, so for
        # 'categories' the vectorizer created above is never fitted and
        # `tv_fit` still holds the previous column's matrix ('name' with the
        # current vec_columns order). This looks like an indentation slip, but
        # dedenting it changes the categories_sim feature; check how the
        # loaded models were trained before fixing.
        tv_fit = tfidf.fit_transform(test[col].fillna(f'no{col}'))
    tfidf_d[col] = tv_fit

Start knn grouped by country


  0%|          | 0/4 [00:00<?, ?it/s]

Start knn
Memory usage of dataframe is 0.00 MB
Memory usage after optimization is: 0.00 MB
Decreased by 9.9%


In [22]:
# Write one embedding lookup file per vectorised column and model name.
bert_models = ['distilbert-base-uncased']
for col in vec_columns:
    for bert_model in bert_models:
        # Pass the model name explicitly: it determines the output file name.
        # Previously the loop variable was ignored and the default was used
        # on every iteration.
        get_vec_col(col, bert_model=bert_model)

Some weights of the model checkpoint at ../input/distilbertbaseuncased/ were not used when initializing DistilBertModel: ['vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/1 [00:00<?, ?it/s]

Some weights of the model checkpoint at ../input/distilbertbaseuncased/ were not used when initializing DistilBertModel: ['vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Exception in thread QueueFeederThread:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/multiprocessing/queues.py", line 232, in _feed
    close()
  File "/opt/conda/lib/python3.7/multiprocessing/connection.py", line 177, in close
    self._c

  0%|          | 0/1 [00:00<?, ?it/s]

Exception in thread QueueFeederThread:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/multiprocessing/queues.py", line 232, in _feed
    close()
  File "/opt/conda/lib/python3.7/multiprocessing/connection.py", line 177, in close
    self._close()
  File "/opt/conda/lib/python3.7/multiprocessing/connection.py", line 361, in _close
    _close(self._handle)
OSError: [Errno 9] Bad file descriptor

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/threading.py", line 926, in _bootstrap_inner
    self.run()
  File "/opt/conda/lib/python3.7/threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/conda/lib/python3.7/multiprocessing/queues.py", line 263, in _feed
    queue_sem.release()
ValueError: semaphore or lock released too many times

Some weights of the model checkpoint at ../input/distilbertbaseuncased/ were not used when initializing DistilBertM

  0%|          | 0/1 [00:00<?, ?it/s]

Exception in thread QueueFeederThread:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/multiprocessing/queues.py", line 232, in _feed
    close()
  File "/opt/conda/lib/python3.7/multiprocessing/connection.py", line 177, in close
    self._close()
  File "/opt/conda/lib/python3.7/multiprocessing/connection.py", line 361, in _close
    _close(self._handle)
OSError: [Errno 9] Bad file descriptor

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/threading.py", line 926, in _bootstrap_inner
    self.run()
  File "/opt/conda/lib/python3.7/threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/conda/lib/python3.7/multiprocessing/queues.py", line 263, in _feed
    queue_sem.release()
ValueError: semaphore or lock released too many times

Some weights of the model checkpoint at ../input/distilbertbaseuncased/ were not used when initializing DistilBertM

  0%|          | 0/1 [00:00<?, ?it/s]

Some weights of the model checkpoint at ../input/distilbertbaseuncased/ were not used when initializing DistilBertModel: ['vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/1 [00:00<?, ?it/s]

Exception in thread QueueFeederThread:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/multiprocessing/queues.py", line 232, in _feed
    close()
  File "/opt/conda/lib/python3.7/multiprocessing/connection.py", line 177, in close
    self._close()
  File "/opt/conda/lib/python3.7/multiprocessing/connection.py", line 361, in _close
    _close(self._handle)
OSError: [Errno 9] Bad file descriptor

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/threading.py", line 926, in _bootstrap_inner
    self.run()
  File "/opt/conda/lib/python3.7/threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/conda/lib/python3.7/multiprocessing/queues.py", line 263, in _feed
    queue_sem.release()
ValueError: semaphore or lock released too many times

Some weights of the model checkpoint at ../input/distilbertbaseuncased/ were not used when initializing DistilBertM

  0%|          | 0/1 [00:00<?, ?it/s]

Exception in thread QueueFeederThread:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/multiprocessing/queues.py", line 232, in _feed
    close()
  File "/opt/conda/lib/python3.7/multiprocessing/connection.py", line 177, in close
    self._close()
  File "/opt/conda/lib/python3.7/multiprocessing/connection.py", line 361, in _close
    _close(self._handle)
OSError: [Errno 9] Bad file descriptor

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/threading.py", line 926, in _bootstrap_inner
    self.run()
  File "/opt/conda/lib/python3.7/threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/conda/lib/python3.7/multiprocessing/queues.py", line 263, in _feed
    queue_sem.release()
ValueError: semaphore or lock released too many times

Some weights of the model checkpoint at ../input/distilbertbaseuncased/ were not used when initializing DistilBertM

  0%|          | 0/1 [00:00<?, ?it/s]

In [23]:
# Optional exact-match recall stage built on recall_simple(); currently disabled.
# from collections import Counter
# test_data_simple = recall_simple(test)
# test_data_simple = test_data_simple[test_data_simple['id'] != test_data_simple['near_id']]
# test_1 = test_1.merge(test_data_simple, on = ['id', 'near_id'],how = 'outer')
# del test_data_simple
# test_1 = test_1.drop_duplicates(subset=['id','near_id'])
# Map each venue id to its index label in `test`. add_features() uses these
# values as positional row numbers, so this assumes `test` still has its
# default 0..n-1 index here - TODO confirm. It must run before the cell that
# calls test.set_index('id').
id2index_d = dict(zip(test['id'].values, test.index))

In [24]:
# Per-venue "value is missing" flags (1 = missing) used by add_extra_features().
# NOTE(review): an earlier cell casts these columns with astype(str), which
# turns missing values into the string 'nan'; isna() is then presumably False
# everywhere, making every flag 0. Confirm against the training pipeline.
_flag_columns = ['name', 'categories', 'address', 'state', 'url', 'country']

tdata = test.copy()
for _flag_col in _flag_columns:
    tdata[_flag_col] = tdata[_flag_col].isna().astype(int)

tdata = (
    tdata[['id'] + _flag_columns]
    .drop_duplicates()
    .reset_index(drop=True)
)

In [25]:
# Index `test` by venue id so add_features() can fetch rows with test.loc[ids].
# NOTE(review): not re-runnable - once 'id' is the index it is no longer a
# column, so executing this cell a second time fails.
test = test.set_index('id')

In [26]:
def manhattan(lat1, long1, lat2, long2):
    """Return the L1 distance |lat2 - lat1| + |long2 - long1|.

    The result is in the same units as the inputs; no spherical correction
    is applied. The arithmetic goes through ``np.abs``, so array inputs are
    handled element-wise.
    """
    lat_gap = np.abs(lat2 - lat1)
    long_gap = np.abs(long2 - long1)
    return lat_gap + long_gap

# get haversine distance
def vectorized_haversine(lats1, lats2, longs1, longs2):
    """Great-circle (haversine) distance in kilometres, element-wise.

    Note the argument order: both latitudes first, then both longitudes.

    Args:
        lats1, lats2: latitudes of the first / second point, in degrees.
        longs1, longs2: longitudes of the first / second point, in degrees.
            Scalars or NumPy-compatible arrays of matching shape.

    Returns:
        Distance(s) in km, computed with an Earth radius of 6371 km.
    """
    radius = 6371  # Earth radius in km
    half_dlat = np.radians(lats2 - lats1) / 2
    half_dlon = np.radians(longs2 - longs1) / 2
    sin_dlat = np.sin(half_dlat)
    sin_dlon = np.sin(half_dlon)
    a = sin_dlat * sin_dlat + np.cos(np.radians(lats1)) \
        * np.cos(np.radians(lats2)) * sin_dlon * sin_dlon
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would turn sqrt(1 - a) into NaN.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    return radius * c

In [27]:
## Prediction
# Score the candidate pairs in `test_1` in NUM_SPLIT chunks of query ids so
# that only one chunk's feature frame is alive at a time.
# (`gc` is imported in the notebook's import cell.)
count = 0
start_row = 0
unique_id = test_1['id'].unique().tolist()
num_split_id = len(unique_id) // NUM_SPLIT
# NOTE(review): when len(unique_id) < NUM_SPLIT the chunk size is 0, so every
# split except the last one is empty -- confirm the feature/model helpers
# tolerate empty frames.

# Loop-invariant: per-place attributes joined onto both sides of every pair.
place_attrs = test.reset_index()[['id','city','state','country','categories','latitude','longitude']]

# Collect per-split predictions and concatenate once at the end; growing a
# DataFrame with pd.concat inside the loop re-copies all earlier rows each time.
pred_chunks = []
for k in range(1, NUM_SPLIT + 1):
    print('Current split: %s' % k)
    end_row = start_row + num_split_id
    # The last split also takes the remainder of the integer division.
    cur_id = unique_id[start_row:end_row] if k < NUM_SPLIT else unique_id[start_row:]
    cur_data = test_1[test_1['id'].isin(cur_id)]

    # Attach attributes of the query place (suffix _x after the second merge)
    # and of the candidate place (suffix _y).
    cur_data = cur_data.merge(place_attrs, on='id', how='left')
    cur_data = cur_data.merge(place_attrs, left_on='near_id', right_on='id', how='left')
    # The candidate's id is already present as near_id; keep the query id as 'id'.
    cur_data = cur_data.drop(columns=['id_y']).rename(columns={'id_x': 'id'})

    cur_data = add_features(cur_data)
    cur_data = add_extra_features(cur_data)
    cur_data = reduce_mem_usage(cur_data)
#     for col in [ 'city_x', 'state_x', 'country_x', 'categories_x','city_y', 'state_y', 'country_y', 'categories_y']:
#         le = joblib.load(f'../input/foursquare-new-model-lgb/model/{col}_label_encode.joblib')
#         cur_data.loc[cur_data[col] == 'nan',col] = np.nan
#         cur_data[col] = le.transform(cur_data[col])
    cur_pred_df = get_submit_all_series(cur_data,'../input/foursquare-new-model-lgb/v12_model/',5)
    cur_pred_df = reduce_mem_usage(cur_pred_df)
    pred_chunks.append(cur_pred_df)

    start_row = end_row
    count += len(cur_data)

    del cur_data, cur_pred_df
    gc.collect()

# pd.concat raises on an empty list, so fall back to an empty frame.
pred_df = pd.concat(pred_chunks) if pred_chunks else pd.DataFrame()
del pred_chunks
gc.collect()
print(count)

Current split: 1


Exception in thread QueueFeederThread:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/multiprocessing/queues.py", line 232, in _feed
    close()
  File "/opt/conda/lib/python3.7/multiprocessing/connection.py", line 177, in close
    self._close()
  File "/opt/conda/lib/python3.7/multiprocessing/connection.py", line 361, in _close
    _close(self._handle)
OSError: [Errno 9] Bad file descriptor

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/threading.py", line 926, in _bootstrap_inner
    self.run()
  File "/opt/conda/lib/python3.7/threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/conda/lib/python3.7/multiprocessing/queues.py", line 263, in _feed
    queue_sem.release()
ValueError: semaphore or lock released too many times



  0%|          | 0/9 [00:00<?, ?it/s]

Memory usage of dataframe is 0.00 MB
Memory usage after optimization is: 0.00 MB
Decreased by 44.4%


  0%|          | 0/5 [00:00<?, ?it/s]

==fold0==
==fold1==
==fold2==
==fold3==
==fold4==
Memory usage of dataframe is 0.00 MB
Memory usage after optimization is: 0.00 MB
Decreased by 18.8%
Current split: 2


  0%|          | 0/9 [00:00<?, ?it/s]

Memory usage of dataframe is 0.00 MB
Memory usage after optimization is: 0.00 MB
Decreased by 60.2%


  0%|          | 0/5 [00:00<?, ?it/s]

==fold0==
==fold1==
==fold2==
==fold3==
==fold4==
Memory usage of dataframe is 0.00 MB
Memory usage after optimization is: 0.00 MB
Decreased by 18.8%
Current split: 3


  0%|          | 0/9 [00:00<?, ?it/s]

Memory usage of dataframe is 0.00 MB
Memory usage after optimization is: 0.00 MB
Decreased by 53.3%


  0%|          | 0/5 [00:00<?, ?it/s]

==fold0==
==fold1==
==fold2==
==fold3==
==fold4==
Memory usage of dataframe is 0.00 MB
Memory usage after optimization is: 0.00 MB
Decreased by 18.8%
Current split: 4


  0%|          | 0/9 [00:00<?, ?it/s]

Memory usage of dataframe is 0.00 MB
Memory usage after optimization is: 0.00 MB
Decreased by 46.3%


  0%|          | 0/5 [00:00<?, ?it/s]

==fold0==
==fold1==
==fold2==
==fold3==
==fold4==
Memory usage of dataframe is 0.00 MB
Memory usage after optimization is: 0.00 MB
Decreased by 18.8%
Current split: 5


  0%|          | 0/9 [00:00<?, ?it/s]

Memory usage of dataframe is 0.00 MB
Memory usage after optimization is: 0.00 MB
Decreased by 62.1%


  0%|          | 0/5 [00:00<?, ?it/s]

==fold0==
==fold1==
==fold2==
==fold3==
==fold4==
Memory usage of dataframe is 0.00 MB
Memory usage after optimization is: 0.00 MB
Decreased by 18.8%
20


In [28]:
# Minimum model score for a candidate pair to be accepted as a match.
MATCH_THRESHOLD = 0.7

# Binarise the scores. Cast to float64 first so the comparison against the
# threshold is done at full precision even if `pred` was downcast to a
# narrower float dtype (matches the original element-by-element comparison).
pred_df['pred'] = (pred_df['pred'].astype(np.float64) > MATCH_THRESHOLD).astype('int64')

# Keep accepted pairs that actually have a candidate id. Both conditions are
# evaluated on the SAME frame and applied as one positional mask. Chaining
# df[mask_a][mask_b] would force pandas to reindex mask_b onto the filtered
# frame, which fails when the index has duplicate labels (pred_df is a concat
# of per-split frames) and any near_id is null.
keep = pred_df['near_id'].notnull().to_numpy() & (pred_df['pred'] == 1).to_numpy()
pred_df = pred_df[keep]

# One row per query id with the list of its accepted candidates.
tmp = pred_df.groupby('id')['near_id'].apply(list).reset_index()
tmp.columns = ['id','matches']
if len(tmp) > 0:
    # Every place matches itself: put its own id first in the list.
    tmp['matches'] = tmp.apply(lambda x : [x['id']] + x['matches'],axis=1)

In [29]:
# Start from the sample submission's ids and attach the predicted match lists.
ssub = (
    pd.read_csv("../input/foursquare-location-matching/sample_submission.csv")
    .drop(columns="matches")
    .merge(tmp, on='id', how='left')
)

# Rows with predictions: join the id list into one space-separated string.
# Rows without predictions: the place only matches itself.
has_match = ~ssub['matches'].isnull()
ssub.loc[has_match, 'matches'] = ssub.loc[has_match, 'matches'].apply(' '.join)
ssub.loc[~has_match, 'matches'] = ssub.loc[~has_match, 'id']

def postprocess(df):
    """Make the match lists symmetric in a single pass.

    For every row whose ``matches`` string lists more than one id, the first
    id of that string is appended to the match list of each other id in it
    (if not already present). ``df["matches"]`` is then overwritten with the
    updated space-joined lists.

    Args:
        df: frame with string columns ``id`` and ``matches`` (space-separated
            ids). It is modified in place.

    Returns:
        The same ``df`` object.

    Raises:
        KeyError: if a listed id does not occur in ``df["id"]``.
    """
    matches_by_id = dict(zip(df["id"].values, df["matches"].str.split()))

    for joined in tqdm(df["matches"]):
        ids = joined.split()
        if len(ids) != 1:
            anchor = ids[0]
            for other in ids[1:]:
                partner_matches = matches_by_id[other]
                if anchor not in partner_matches:
                    partner_matches.append(anchor)

    df["matches"] = df["id"].map(matches_by_id).map(" ".join)
    return df

#train = postprocess(train)
# Symmetrise the match lists, then write the competition submission file.
ssub = postprocess(ssub)
ssub.to_csv(path_or_buf='submission.csv', index=False)

  0%|          | 0/5 [00:00<?, ?it/s]

In [30]:
# Display the final submission frame (columns: id, matches) as a last sanity check.
ssub

Unnamed: 0,id,matches
0,E_00001118ad0191,E_00001118ad0191
1,E_000020eb6fed40,E_000020eb6fed40
2,E_00002f98667edf,E_00002f98667edf
3,E_001b6bad66eb98,E_001b6bad66eb98
4,E_0283d9f61e569d,E_0283d9f61e569d
