# PIQ 1: 构建数据集

In [1]:
import pandas as pd

In [2]:
from functools import reduce

def load_transform_csv(csv_file):
    """Read one fingerprint CSV and return it keyed by image index.

    The column named ``'Unnamed: 0'`` is copied into a column called
    ``img_index``; only ``img_index`` and ``fp_long`` are kept, and
    ``img_index`` then becomes the index of the returned frame.

    Parameters
    ----------
    csv_file
        Anything ``pandas.read_csv`` accepts (path or file-like object).

    Returns
    -------
    pandas.DataFrame
        A frame with the single column ``fp_long``, indexed by ``img_index``.
    """
    raw = pd.read_csv(csv_file)
    with_index_col = raw.assign(img_index=raw['Unnamed: 0'])
    selected = with_index_col.loc[:, ['img_index', 'fp_long']]
    return selected.set_index('img_index')

def rbatch(a, b):
    """Reducer step: load the next batch file and stack it under the rows so far.

    Parameters
    ----------
    a : tuple
        ``(previous_end, frame)`` -- the upper bound of the batch that was
        loaded before this one, and the frame accumulated so far.
    b : int
        Upper bound of the batch to load. The file that gets read is
        ``fingerprint_{previous_end}-{b}.csv``.

    Returns
    -------
    tuple
        ``(b, frame)`` where ``frame`` is the accumulated frame followed by
        the rows of the newly loaded batch. The shape matches ``a``, so the
        function can be used directly with ``functools.reduce``.
    """
    csvfile = 'fingerprint_{}-{}.csv'.format(a[0], b)
    # DataFrame.append was removed in pandas 2.0; pd.concat gives the same
    # row-wise stacking and works on both old and new pandas.
    df = pd.concat([a[1], load_transform_csv(csvfile)])
    return (b, df)

def mergeFpSaveTo(fpFile='fingerprint.csv', batch_size=10000, total=113891):
    """Merge the per-batch fingerprint CSVs into one CSV file.

    Batch files are expected to be named ``fingerprint_{lo}-{hi}.csv`` where
    consecutive ``lo``/``hi`` bounds step by ``batch_size`` from 0 and the
    last file ends at ``total``. With the defaults this reads
    ``fingerprint_0-10000.csv`` ... ``fingerprint_100000-110000.csv`` and
    ``fingerprint_110000-113891.csv``, in that order.

    Parameters
    ----------
    fpFile : str
        Path of the merged CSV to write.
    batch_size : int
        Width of every batch except possibly the last one.
    total : int
        Upper bound of the final batch file.

    Returns
    -------
    None
        The merged frame is written to ``fpFile``.

    Raises
    ------
    ValueError
        From ``pandas.concat`` when the bounds describe no batch at all
        (``total <= 0``).
    """
    # Batch boundaries: 0, batch_size, 2 * batch_size, ..., total.
    bounds = list(range(0, total, batch_size))
    bounds.append(total)

    # Load every batch first and concatenate once; growing a frame batch by
    # batch re-copies all previously accumulated rows on each step.
    frames = [
        load_transform_csv('fingerprint_{}-{}.csv'.format(lo, hi))
        for lo, hi in zip(bounds, bounds[1:])
    ]
    pd.concat(frames).to_csv(fpFile)

#### 图片边长与hash长度关系

| $k$ | 图片尺寸 $2 \times k$ | Hash长度 $k^2$ |
| -- | -- | -- |
| 2  | <span style="color: red">4</span> | 4 |
| 3  | 6 | 9 |
| 4  | 8 | 16 |
| 5  | 10 | 25 |
| 6  | 12 | 36 |
| 7  | 14 | 49 |
| 8  | <span style="color: red">16</span> | 64 |
| 9  | 18 | 81 |
| 10  | <span style="color: red">20</span> | 100 |
| 16  | <span style="color: red">32</span> | 256 |
| 20  | <span style="color: red">40</span> | 400 |

In [59]:
from pimquery import PImQuery, fp2des
from time import time
from helper import saveJson, loadJson
import numpy as np

In [4]:
# Build the query object with PImQuery's default constructor arguments.
piq = PImQuery()
# Disabled variants: additional PImQuery instances with hash_k = 3..6,
# each passed piq.df_fp as its df_fp argument.
# piq_k3 = PImQuery(hash_k=3, df_fp=piq.df_fp)
# piq_k4 = PImQuery(hash_k=4, df_fp=piq.df_fp)
# piq_k5 = PImQuery(hash_k=5, df_fp=piq.df_fp)
# piq_k6 = PImQuery(hash_k=6, df_fp=piq.df_fp)

In [5]:
# Wall-clock timing of the findRepeats() call below.
# NOTE: this is the slow step of the notebook -- the recorded run printed
# a cost of roughly 3387 s.
start = time()

repeats = piq.findRepeats()

print('Cost time: {}s'.format(time() - start))

# Persist the result so later cells can reload it instead of recomputing.
saveJson(repeats, 'data/piq_res_repeats_k2.json')

# Read the file back and show how many entries were stored.
len(loadJson('data/piq_res_repeats_k2.json'))

Cost time: 3387.2563428878784s


31828

In [40]:
def repeats2Df(repeats, num):
    """Expand a list of (row, similar-to) pairs into a dense per-row table.

    Parameters
    ----------
    repeats : sequence of 2-item sequences
        Each item is ``(row, simto)``: ``row`` is a row label in
        ``0 .. num - 1`` to flag, ``simto`` is the value stored for it.
        Tuples and lists (e.g. pairs loaded from JSON) both work.
    num : int
        Number of rows in the returned frame.

    Returns
    -------
    pandas.DataFrame
        ``num`` rows with columns ``repeat`` (1 for rows listed in
        ``repeats``, otherwise 0) and ``simto`` (the paired value for listed
        rows, otherwise 0).
    """
    df = pd.DataFrame(dict(repeat=[0] * num, simto=[0] * num))
    if len(repeats) == 0:
        # zip(*[]) yields nothing to unpack; an empty input simply means
        # that no row is flagged.
        return df

    reps, simto = zip(*repeats)
    rows = list(reps)
    # Single .loc assignments instead of chained df[col][rows] = ...:
    # chained assignment may write to a temporary copy (SettingWithCopy)
    # and never updates df when pandas copy-on-write is active.
    df.loc[rows, 'repeat'] = 1
    df.loc[rows, 'simto'] = list(simto)
    return df

In [41]:
# Reload the saved findRepeats() result rather than recomputing it.
repeats = loadJson('data/piq_res_repeats_k2.json')
# One row per entry of piq.df_hash, flagged via repeats2Df.
df = repeats2Df(repeats, len(piq.df_hash))
# index=False: the row number is implicit, so only repeat/simto are written.
df.to_csv('data/piq_predict_repeat_k2.csv', index=False)

In [53]:
# Load the per-row repeat predictions written to CSV by the previous cell.
predict_y = pd.read_csv('data/piq_predict_repeat_k2.csv')

In [58]:
# Per-column count of non-null values among the rows flagged repeat == 1.
predict_y[predict_y.repeat==1].count()

repeat    31828
simto     31828
dtype: int64

In [68]:
# Convert every entry of piq.df_fp['fp_long'] with fp2des, keep the results
# as plain lists (so they can be written as JSON), and time the whole pass.
start = time()

imdes = [fp2des(fingerprint).tolist() for fingerprint in piq.df_fp['fp_long']]

print('Cost time: {}s'.format(time() - start))

saveJson(imdes, 'data/piq_imdes.json')

Cost time: 60.28981375694275s


In [None]:
# Reload the descriptors saved to data/piq_imdes.json by the previous cell.
piq_imdes = loadJson('data/piq_imdes.json')

# Number of entries that were loaded.
len(piq_imdes)