In [None]:
# Load IPython's autoreload extension and switch it to mode 2, so edits to
# imported project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Change the working directory to ../src (relies on IPython automagic for `cd`).
# NOTE(review): `from params import *` below presumably resolves from this
# directory — confirm before running the notebook from another location.
cd ../src

## Imports

In [None]:
import os
import re
import cv2
import glob
import json
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from tqdm.notebook import tqdm
from collections import Counter

from params import *

# Let pandas show up to 100 columns before truncating wide frames.
pd.options.display.max_columns = 100

## Load

In [None]:
# Training table; later cells read its `id` and `point_of_interest` columns.
df = pd.read_csv(filepath_or_buffer=DATA_PATH + "train.csv")

In [None]:
# dfg = df[["id", "point_of_interest", "country"]].groupby('point_of_interest').agg(list)
# dfg['country'] = dfg['country'].apply(lambda x: np.unique(x))

In [None]:
# Candidate pairs, enriched with the POI label of each member.
pairs = pd.read_csv(DATA_PATH + "pairs.csv")

id_to_poi = df[['id', 'point_of_interest']]

# First member: the label arrives as a plain `point_of_interest` column.
pairs = pairs.merge(id_to_poi, left_on="id_1", right_on="id").drop(columns='id')

# Second member: the name clash triggers the suffixes, yielding
# `point_of_interest_1` / `point_of_interest_2`.
pairs = pairs.merge(
    id_to_poi, left_on="id_2", right_on="id", suffixes=('_1', '_2')
).drop(columns='id')

In [None]:
# Test table, loaded for inspection only in this notebook.
df_test = pd.read_csv(filepath_or_buffer=DATA_PATH + "test.csv")

In [None]:
# Peek at the first test row to see the available columns.
df_test.head(1)

In [None]:
# Check that both POI columns were attached to the pairs.
pairs.head()

### Cluster of pairs for folds

### POIs in pairs

In [None]:
# Build a symmetric edge list: every (a, b) pair also appears as (b, a).
pois = pairs[["point_of_interest_1", "point_of_interest_2"]].copy()
pois_ = pois.rename(
    columns={
        "point_of_interest_1": "point_of_interest_2",
        "point_of_interest_2": "point_of_interest_1",
    }
)

pois = pd.concat([pois, pois_], ignore_index=True)

In [None]:
# One row per (left POI, linked POI), starting from the distinct left POIs.
unique_left = pois[["point_of_interest_1"]].drop_duplicates(keep="first")
pois_merged = pd.merge(unique_left, pois, on="point_of_interest_1", how="left")

In [None]:
def _sorted_unique(values):
    """Return the distinct entries of `values` as a sorted list."""
    return sorted(np.unique(list(values)))


# Collapse to one row per POI holding the sorted list of POIs it is paired with.
pois_merged = (
    pois_merged
    .groupby('point_of_interest_1')
    .agg(_sorted_unique)
    .reset_index()
)

In [None]:
def _with_self(row):
    """Return the row's linked POIs plus the row's own POI, de-duplicated."""
    return list(set([row.point_of_interest_1] + list(row.point_of_interest_2)))


# Each group now contains the anchor POI itself.
pois_merged['point_of_interest_2'] = pois_merged.apply(_with_self, axis=1)

In [None]:
# Number of distinct POIs in each group (anchor included).
pois_merged['len'] = pois_merged['point_of_interest_2'].map(len)

In [None]:
# Keep only groups that link at least two different POIs. `drop=True` stops the
# old index from being kept as a stray "index" column in the result.
pois_merged = pois_merged[pois_merged['len'] > 1].reset_index(drop=True)

In [None]:
# Canonical string key for a group: its members sorted and space-joined.
pois_merged['pois'] = pois_merged['point_of_interest_2'].map(
    lambda members: ' '.join(sorted(members))
)

In [None]:
# Drop groups with an identical member set, keeping the first occurrence.
pois_merged = (
    pois_merged
    .drop_duplicates(subset=['pois'], keep="first")
    .reset_index(drop=True)
)

In [None]:
# Inspect the first de-duplicated POI groups.
pois_merged.head(10)

### POI clusters

In [None]:
# Assign one cluster id to every set of POIs that are transitively linked
# through the groups in `pois_merged['point_of_interest_2']`.
mapping = {}  # POI -> cluster id
num = 0       # next unused cluster id

# The loop variable is deliberately not called `pois`, so the `pois` DataFrame
# built earlier in the notebook is not overwritten.
for poi_group in tqdm(pois_merged['point_of_interest_2']):
    assert len(poi_group) > 1

    # Cluster ids already held by members of this group, in group order.
    known_ids = [mapping[poi] for poi in poi_group if poi in mapping]

    if known_ids:
        # Reuse the id of the first member that is already clustered.
        found_idx = known_ids[0]
        for poi in poi_group:
            mapping[poi] = found_idx

        # Any other id seen in the group belongs to a cluster that this group
        # bridges to `found_idx`: relabel all of that cluster's members.
        stale_ids = set(known_ids) - {found_idx}
        if stale_ids:
            for k in mapping:
                if mapping[k] in stale_ids:
                    mapping[k] = found_idx
    else:
        # No member seen before: open a new cluster.
        for poi in poi_group:
            mapping[poi] = num
        num += 1

### Sanity Check

In [None]:
# Turn the POI -> cluster id dict into a two-column lookup table.
clusts = (
    pd.DataFrame.from_dict(mapping, orient="index")
    .reset_index()
    .set_axis(["poi", "clust"], axis=1)
)

In [None]:
# Attach the cluster of each side of every pair (inner joins).
pairs = pd.merge(pairs, clusts, left_on="point_of_interest_1", right_on="poi")
pairs = pd.merge(
    pairs,
    clusts,
    left_on="point_of_interest_2",
    right_on="poi",
    suffixes=('_1', '_2'),
)

# Rows shown here are pairs whose two sides landed in different clusters.
pairs.loc[pairs['clust_1'].ne(pairs['clust_2'])]

In [None]:
# clusts.to_csv(DATA_PATH + "poi_clusts.csv", index=False)

### Handling POIs without a cluster

In [None]:
# Reload the training table from disk before attaching cluster ids.
df = pd.read_csv(filepath_or_buffer=DATA_PATH + "train.csv")

In [None]:
# Left join keeps every training row; POIs absent from `clusts` get NaN in `clust`.
df = pd.merge(
    df, clusts, left_on="point_of_interest", right_on="poi", how="left"
).drop(columns='poi')

In [None]:
# POIs that have no cluster id after the left merge with `clusts`.
unclustered = df.loc[df['clust'].isna(), 'point_of_interest']

# Sort before numbering: iterating a set of strings follows hash order, which
# changes between interpreter runs, so the ids would not be reproducible.
# key=str keeps the sort well-defined even if a missing value slips in.
to_map = sorted(unclustered.unique(), key=str)

# Continue numbering after the largest existing id; start at 0 if there is none.
has_clusters = df['clust'].notna().any()
current = int(df['clust'].max()) + 1 if has_clusters else 0
new_map = {poi: current + offset for offset, poi in enumerate(to_map)}

In [None]:
# Lookup table for the freshly numbered single-POI clusters.
new_clusts = (
    pd.DataFrame.from_dict(new_map, orient="index")
    .reset_index()
    .set_axis(["point_of_interest", "clust"], axis=1)
)

In [None]:
# Both frames carry a `clust` column, so the join yields `clust_x` (from df)
# and `clust_y` (from new_clusts).
df = pd.merge(df, new_clusts, on="point_of_interest", how="left")

In [None]:
# Inspect the `clust_x` / `clust_y` columns produced by the merge.
df.head()

In [None]:
# Replace missing ids by 0 so the two columns can be added together.
cluster_cols = ['clust_x', 'clust_y']
df[cluster_cols] = df[cluster_cols].fillna(0)

In [None]:
# Collapse the two columns into a single integer `clust` by adding them,
# then remove the helper columns.
df['clust'] = df['clust_x'].add(df['clust_y']).astype(int)
df.drop(columns=['clust_x', 'clust_y'], inplace=True)

### Check

In [None]:
# Reload the pairs and attach POI label and cluster id for both members.
pairs = pd.read_csv(DATA_PATH + "pairs.csv")

id_labels = df[['id', 'point_of_interest', 'clust']]

pairs = pairs.merge(id_labels, left_on="id_1", right_on="id").drop(columns='id')

# The name clash on the second join produces the `_1` / `_2` suffixed columns.
pairs = pairs.merge(
    id_labels, left_on="id_2", right_on="id", suffixes=('_1', '_2')
).drop(columns='id')

In [None]:
# Display the pairs with their POI and cluster columns.
pairs

In [None]:
# Pairs whose two members carry different cluster ids.
pairs.loc[pairs['clust_1'].ne(pairs['clust_2'])]

### Save

In [None]:
# df.to_csv(DATA_PATH + "df_train.csv", index=False)

### Folds

In [None]:
# Check the training frame (now with `clust`) before building folds.
df.head()

In [None]:
from sklearn.model_selection import *

In [None]:
K = 2  # number of folds

gkf = GroupKFold(n_splits=K)

# Splitting on `clust` as the group key; the next cell checks that no cluster
# ends up in more than one fold.
splits = gkf.split(df, groups=df['clust'])

df_split = df[["id", "point_of_interest", "clust"]].copy()

# GroupKFold yields positional row numbers. Write them into a plain array
# (positional by construction) rather than through label-based `.loc`, which is
# only correct while the frame carries a default RangeIndex.
fold_of_row = np.full(len(df_split), -1, dtype=np.int64)
for i, (_, val_idx) in enumerate(splits):
    fold_of_row[val_idx] = i
df_split['fold'] = fold_of_row

# df_split.to_csv(DATA_PATH + f"folds_{K}.csv", index=False)

In [None]:
# Largest number of distinct folds found within one cluster.
dfg = df_split.groupby('clust').agg(list)
dfg['fold'].map(lambda folds: len(set(folds))).max()

### Matches

In [None]:
# Slim view of the pairs: ids, POIs, match label and the first member's cluster.
match_columns = [
    "id_1",
    "id_2",
    "point_of_interest_1",
    "point_of_interest_2",
    "match",
    "clust_1",
]
dfp = pairs[match_columns]

## Triplets

In [None]:
# Fresh copy of the pairs with POI label and cluster id for both members.
pairs = pd.read_csv(DATA_PATH + "pairs.csv")

poi_and_clust = df[['id', 'point_of_interest', 'clust']]

pairs = pairs.merge(poi_and_clust, left_on="id_1", right_on="id").drop(columns='id')
pairs = pairs.merge(
    poi_and_clust,
    left_on="id_2",
    right_on="id",
    suffixes=('_1', '_2'),
).drop(columns='id')

In [None]:
# Mirror every pair so each id appears on the left side of all its pairs.
ids = pairs[["id_1", "id_2", "match"]].copy()
ids_ = ids.rename(columns={"id_1": "id_2", "id_2": "id_1"})

ids = pd.concat([ids, ids_], ignore_index=True)

In [None]:
# One row per id, with the list of ids it is paired with and the match flags.
anchor_ids = ids[["id_1"]].drop_duplicates(keep="first")
triplets = pd.merge(anchor_ids, ids, on="id_1", how="left")

triplets = triplets.groupby('id_1').agg(list).reset_index()

# Keep only ids that have at least one positive match.
has_positive = triplets['match'].apply(lambda flags: True in flags)
triplets = triplets[has_positive]
# triplets = triplets[triplets['match'].apply(lambda x: True in x and False in x)]

triplets.columns = ['id', 'paired_ids', 'matches']

In [None]:
# Persist the triplets table next to the other data files.
triplets.to_csv(path_or_buf=DATA_PATH + 'triplets.csv', index=False)