# In [None]:

import csv
import cv2
import hashlib
import itertools
import memcache
import multiprocessing as mp
import os
import sys

from zipfile import ZipFile


def get_image(x, mc, zipfiles):
    """Return the raw bytes of image ``x``, using ``mc`` as a read-through cache.

    ``x`` is the image id as a string of digits.  ``mc`` must offer
    ``get(key)`` and ``set(key, value, time=...)``; ``zipfiles`` maps a
    one-digit string to an object with a ``read(member_name)`` method.
    """
    cache_key = "file:" + x
    payload = mc.get(cache_key)
    if payload is None:
        # Left-pad so that ids shorter than two digits still have a
        # second-to-last character to index.
        padded = "00" + x
        archive_no = int(padded[-2])   # second-to-last digit picks the archive
        folder_no = int(padded[-2:])   # last two digits pick the sub-folder
        member = "Images_%01d/%01d/%s.jpg" % (archive_no, folder_no, x)
        payload = zipfiles["%01d" % archive_no].read(member)
        # Keep the bytes around for 10 seconds only.
        mc.set(cache_key, payload, time=10)
    return payload
    
def get_hash(x):
    """Return the hexadecimal MD5 digest of the byte string ``x``."""
    import hashlib

    digest = hashlib.md5()
    digest.update(x)
    return digest.hexdigest()

def get_cv2_hist(x):
    """Return a flattened, normalised 8x8x8 colour histogram of encoded image ``x``.

    ``x`` holds the encoded image bytes; they are decoded with OpenCV
    (flag ``1``) and binned over channels 0, 1 and 2 with 8 bins each on
    the range [0, 256).
    """
    import cv2
    import numpy as np

    encoded = np.frombuffer(x, np.uint8)
    image = cv2.imdecode(encoded, 1)
    histogram = cv2.calcHist(
        [image], [0, 1, 2], None, [8, 8, 8], [0, 256, 0, 256, 0, 256])
    return cv2.normalize(histogram).flatten()

def get_skimg(x):
    """Decode the encoded image bytes ``x`` with ``skimage.io.imread``."""
    from StringIO import StringIO
    from skimage.io import imread

    # imread wants a file-like object, so wrap the in-memory bytes.
    stream = StringIO(x)
    return imread(stream)

def process_image_pair(args):
    """Compute a fixed list of similarity metrics for one pair of encoded images.

    ``args`` is a single tuple ``(index, item1, item2, x, y)`` so the function
    can be mapped over a ``multiprocessing`` pool; ``x`` and ``y`` are the
    encoded image bytes.

    Returns ``(index, item1, item2, result)`` where ``result`` is a list of
    ``(metric_name, value)`` tuples, always with the same names in the same
    order: "Shape", "Exact", the four OpenCV histogram comparisons, the nine
    scipy distances, the four skimage metrics on the decoded images and the
    same four on their contrast-equalised grayscale versions (".adapt").

    A metric that cannot be computed is reported as ``None`` (after printing
    the traceback through ``sys.excepthook``) instead of aborting the pair.
    The skimage metrics are also ``None`` whenever "Shape" is not 1.0.
    Only ``Exception`` subclasses are trapped, so KeyboardInterrupt and
    SystemExit propagate.
    """
    index, item1, item2, x, y = args
    import sys
    import warnings
    from scipy.spatial import distance as dist
    from skimage import color, exposure, measure

    def attempt(func, *operands):
        # Best-effort call: None when an input is unavailable or func raises.
        if any(operand is None for operand in operands):
            return None
        try:
            return func(*operands)
        except Exception:
            sys.excepthook(*sys.exc_info())
            return None

    def quiet_rounded(func):
        # Wrap a skimage metric: silence its warnings and round to 6 digits.
        def run(a, b):
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                return round(func(a, b), 6)
        return run

    def adaptive_gray(image):
        return exposure.equalize_adapthist(color.rgb2gray(image), clip_limit=0.01)

    # Decode each image once; the arrays serve both the shape test and the
    # pixel-wise metrics further down.
    skimgx = attempt(get_skimg, x)
    skimgy = attempt(get_skimg, y)
    same_shape = attempt(lambda a, b: float(a.shape == b.shape), skimgx, skimgy)

    result = [
        ("Shape", same_shape),
        ("Exact", attempt(lambda a, b: float(get_hash(a) == get_hash(b)), x, y)),
    ]

    cv2histx = attempt(get_cv2_hist, x)
    cv2histy = attempt(get_cv2_hist, y)

    OPENCV_METHODS = (
        ("Correlation", cv2.cv.CV_COMP_CORREL),
        ("Chi-Squared", cv2.cv.CV_COMP_CHISQR),
        ("Intersection", cv2.cv.CV_COMP_INTERSECT),
        ("Hellinger", cv2.cv.CV_COMP_BHATTACHARYYA))
    for name, method in OPENCV_METHODS:
        result.append((name, attempt(
            lambda a, b, m=method: round(cv2.compareHist(a, b, m), 6),
            cv2histx, cv2histy)))

    SCIPY_METHODS = (
        ("scipy.braycurtis", dist.braycurtis),
        ("scipy.canberra", dist.canberra),
        ("scipy.chebyshev", dist.chebyshev),
        ("scipy.cityblock", dist.cityblock),
        ("scipy.cosine", dist.cosine),
        ("scipy.euclidean", dist.euclidean),
        ("scipy.hamming", dist.hamming),
        ("scipy.jaccard", dist.jaccard),
        ("scipy.sqeuclidean", dist.sqeuclidean),
    )
    for name, metric in SCIPY_METHODS:
        result.append((name, attempt(
            lambda a, b, m=metric: round(m(a, b), 6),
            cv2histx, cv2histy)))

    if not same_shape:
        # Pixel-wise metrics are only reported for equally shaped images;
        # dropping the arrays makes every metric below come out as None.
        skimgx = skimgy = None
    skimgxa = attempt(adaptive_gray, skimgx)
    skimgya = attempt(adaptive_gray, skimgy)

    SKIMAGE_METHODS = (
        ("skimage.compare_mse", measure.compare_mse),
        ("skimage.compare_ssim_3", lambda a, b: measure.compare_ssim(a, b, win_size=3, multichannel=True)),
        ("skimage.compare_ssim_5", lambda a, b: measure.compare_ssim(a, b, win_size=5, multichannel=True)),
        ("skimage.compare_ssim_7", lambda a, b: measure.compare_ssim(a, b, win_size=7, multichannel=True)),
    )
    for name, metric in SKIMAGE_METHODS:
        result.append((name, attempt(quiet_rounded(metric), skimgx, skimgy)))

    SKIMAGE_ADAPT_METHODS = (
        ("skimage.compare_mse.adapt", measure.compare_mse),
        ("skimage.compare_ssim_3.adapt", lambda a, b: measure.compare_ssim(a, b, win_size=3)),
        ("skimage.compare_ssim_5.adapt", lambda a, b: measure.compare_ssim(a, b, win_size=5)),
        ("skimage.compare_ssim_7.adapt", lambda a, b: measure.compare_ssim(a, b, win_size=7)),
    )
    for name, metric in SKIMAGE_ADAPT_METHODS:
        result.append((name, attempt(quiet_rounded(metric), skimgxa, skimgya)))

    # NOTE: "ORB" and "BRIEF" keypoint-match counts (skimage.feature) were
    # prototyped here earlier and left disabled; they are not part of the
    # emitted metric list.

    return index, item1, item2, result


def process_images_arrays(index, data, mc, zipfiles):
    """Serially compare every image of one item with every image of the other.

    ``data`` is ``(itemID_1, itemID_2, imgsx, imgsy)`` where the last two are
    ", "-separated lists of image ids.  Returns ``(index, data, results)``
    with ``results`` a list of ``(imgx, imgy, metrics)``; ``metrics`` is the
    ``(name, value)`` list produced by ``process_image_pair``.
    """
    itemID_1, itemID_2, imgsx, imgsy = data
    results = []
    for imgx, imgy in itertools.product(str(imgsx).split(", "), str(imgsy).split(", ")):
        # Skip empty ids, which appear when an item has no images.
        if not (imgx.strip() and imgy.strip()):
            continue
        # process_image_pair takes ONE tuple carrying the image bytes, not
        # the ids plus the cache/archives, so load the bytes here.
        pair_args = (
            index, itemID_1, itemID_2,
            get_image(imgx, mc, zipfiles),
            get_image(imgy, mc, zipfiles),
        )
        metrics = process_image_pair(pair_args)[3]
        results.append((imgx, imgy, metrics))
    return index, data, results

class DumpArchive(object):
    """Directory-backed stand-in offering the same ``read(name)`` call as ``ZipFile``.

    Used where an archive could not be opened: member names are resolved as
    paths relative to ``path``.
    """

    def __init__(self, path):
        """Remember ``path``; raise ``Exception`` if it does not exist."""
        if not os.path.exists(path):
            raise Exception("Path %s does not exists" % path)
        self.path = path

    def read(self, filename):
        """Return the bytes of ``filename`` located under ``self.path``."""
        # The context manager closes the handle even if reading fails.
        with open(os.path.join(self.path, filename), "rb") as handle:
            return handle.read()

def process_images(data_dir="."):
    """Compute image-similarity metrics for all item pairs and write them to CSV.

    For each mode ("train", "test") the pairs in ``ItemPairs_<mode>.csv`` are
    joined with the per-item image lists, every image of the first item is
    paired with every image of the second, and ``process_image_pair`` is
    mapped over those pairs on an 8-worker pool.  One row per image pair is
    written to ``ItemPairs_<mode>.csv_ImgCmp.csv``; the header is taken from
    the metric names of the first result.

    A mode whose output file already exists is skipped entirely.  (Code that
    tried to resume a partial file used to follow the ``continue`` and was
    never reached; it has been removed.)
    NOTE(review): because the file is created before the first row arrives,
    an interrupted run leaves a partial file that later runs will skip.

    Image bytes are read in this process (via ``get_image``) and shipped to
    the workers; ``process_images_arrays`` is the serial equivalent.
    """
    from zipfile import ZipFile
    import csv
    import multiprocessing as mp
    import pandas as pd
    import sys

    pair_columns = ["itemID_1", "itemID_2", "images_array_x", "images_array_y"]

    def pair_jobs(pr, mc, zipfiles):
        # Lazily yield one argument tuple per (image of item 1, image of item 2).
        rows = iter(pr[pair_columns].fillna("").values)
        for index, (itemID_1, itemID_2, imgsx, imgsy) in itertools.izip(pr.index, rows):
            for imgx, imgy in itertools.product(str(imgsx).split(", "), str(imgsy).split(", ")):
                if imgx.strip() and imgy.strip():
                    yield (index, itemID_1, itemID_2,
                           get_image(imgx, mc, zipfiles),
                           get_image(imgy, mc, zipfiles))

    pool = mp.Pool(8)
    try:
        mc = memcache.Client(['127.0.0.1:11211'], debug=0)
        for mode in ["train", "test"]:
            fn = os.path.join(data_dir, "ItemPairs_%s.csv_ImgCmp.csv" % mode)
            if os.path.exists(fn):
                continue
            params = {}
            zipfiles = {}
            try:
                for archive_no in xrange(10):
                    sys.stderr.write(str(archive_no) + " ")
                    try:
                        zipfiles[str(archive_no)] = ZipFile(
                            os.path.join(data_dir, "Images_%d.zip" % archive_no))
                    except Exception:
                        # No usable archive: read members straight from data_dir.
                        zipfiles[str(archive_no)] = DumpArchive(data_dir)
                sys.stderr.write("\n")
                sys.stderr.write(fn + "\n")
                if mode == "test":
                    params["index_col"] = 0
                imgarr = pd.read_csv(
                    os.path.join(data_dir, "ItemInfo_%s.csv_images_array.csv" % mode),
                    index_col=0)
                pr = pd.read_csv(os.path.join(data_dir, "ItemPairs_%s.csv" % mode), **params)
                pr = pd.merge(pr, imgarr, left_on="itemID_1", right_index=True, how="inner", sort=False)
                pr = pd.merge(pr, imgarr, left_on="itemID_2", right_index=True, how="inner", sort=False)
                header_written = False
                with open(fn, "a") as fo:
                    w = csv.writer(fo)
                    for index, itemID_1, itemID_2, res in pool.imap(
                            process_image_pair, pair_jobs(pr, mc, zipfiles)):
                        if not header_written:
                            w.writerow(["id", "itemID_1", "itemID_2"] + [n for n, r in res])
                            header_written = True
                        w.writerow([index, itemID_1, itemID_2] + [r for n, r in res])
            finally:
                # Release archive handles; DumpArchive has nothing to close.
                for archive in zipfiles.values():
                    closer = getattr(archive, "close", None)
                    if closer is not None:
                        closer()
            sys.stderr.write("DONE\n")
    finally:
        # All results have been consumed (or an error is propagating), so the
        # workers can be stopped immediately.
        pool.terminate()
        pool.join()
            
        
    


# In [None]:
import json
import nltk
from sklearn.feature_extraction.text import CountVectorizer
# Text analyzers shared by the stemming helpers below.  build_analyzer()
# returns the vectorizer's preprocessing + tokenizing callable; no vocabulary
# is fitted here.
# NOTE(review): min_df presumably has no influence on the analyzer callable
# itself - confirm.
analyzer = CountVectorizer(
    min_df=10
).build_analyzer()
# Same analyzer, but dropping the tokens listed in stop_words.
# NOTE(review): the entries look like stems rather than full words, so they
# can only match text that was already stemmed - check what split_files
# actually feeds into stemstringstop.
analyzerstop = CountVectorizer(
    min_df=10,
    stop_words=u'ищ прода прод из по от под все вс прод для эт нов м² ког ту том ком ко ещ еш тоб тольк ве тем во нет а быт ты и тех не на но при чег один вы ем ел ед ег те та кто то же будьт всю вся себ всех ним одн им из будеш нам будет у могут я нас так наш мно всег всем будут них мне будуч все могл когд мен теб он буд об для кем бы тот мож мог в к уж о чем вот тог моч их может ил можеш эт сво был нег нем от чтоб мо ест соб до да вас е с по вам что за есл как сам котор этот мы'.split(' ')
).build_analyzer()
# Snowball stemmer for Russian, applied to every token by stemstring /
# stemstringstop.
stemmer = nltk.stem.snowball.RussianStemmer()

# Lossy Cyrillic -> Latin character table: every lowercase Russian letter maps
# to at most one ASCII letter, several letters share a target (e.g. "ж"/"з",
# "ш"/"щ") and the hard/soft signs are dropped.  It is only referenced from
# the code that follows the early return in stemstring.
translate = {
    u"а": "a",
    u"б": "b",
    u"в": "v",
    u"г": "g",
    u"д": "d",
    u"е": "e",
    u"ё": "e",
    u"ж": "z",
    u"з": "z",
    u"и": "i",
    u"й": "i",
    u"к": "k",
    u"л": "l",
    u"м": "m",
    u"н": "n",
    u"о": "o",
    u"п": "p",
    u"р": "r",
    u"с": "s",
    u"т": "t",
    u"у": "u",
    u"ф": "f",
    u"х": "h",
    u"ц": "c",
    u"ч": "c",
    u"ш": "s",
    u"щ": "s",
    u"ъ": "",
    u"ы": "i",
    u"ь": "",
    u"э": "e",
    u"ю": "u",
    u"я": "a",
}

def stemstring(x):
    """Tokenize ``x`` with ``analyzer`` and return the space-joined stems.

    Each stem is encoded to UTF-8 before joining, so the result is a byte
    string.
    """
    return " ".join([stemmer.stem(token).encode('utf8') for token in analyzer(x)])


def _stemstring_translit(x):
    """Alternative normaliser that used to sit, unreachable, after the return above.

    Not called anywhere; kept so the experiment is not lost.  Compared with
    ``stemstring`` it additionally
      * glues a preceding negation particle ("не"/"ни") onto the next token,
      * splits tokens around Latin-alphanumeric runs that contain a digit,
        stems the remaining pieces of length >= 2, transliterates them with
        ``translate`` and collapses repeated letters,
      * appends every lowercase Latin run and every digit run of the token
        as separate terms.
    Returns a UTF-8 encoded, space-joined string.
    """
    import re
    resplit = re.compile(r'[a-z0-9]*[0-9][a-z0-9]*')
    rechar = re.compile(r'[a-z]+')
    renum = re.compile(r'[0-9]+')
    res = []
    wordnot = False
    for token in analyzer(x):
        if token == u"не" or token == u"ни":
            # Remember the negation and attach it to the following token.
            wordnot = True
            continue
        if wordnot:
            token = u"не" + unicode(token)
            wordnot = False
        for part in re.split(resplit, token):
            if len(part) < 2:
                continue
            part = stemmer.stem(part)
            for sym in translate:
                part = part.replace(sym, translate[sym])
            # Collapse runs of the same letter ("aa" -> "a").
            part = re.sub(r'(?u)([^\W\d])\1+', r'\1', part)
            res.append(part)
        for part in re.findall(rechar, token):
            res.append(part)
        for part in re.findall(renum, token):
            res.append(part)
    return unicode(" ".join(res)).encode("UTF-8")


def stemstringstop(x):
    """Like ``stemstring`` but tokenizes with the stop-word-filtering ``analyzerstop``."""
    stems = []
    for token in analyzerstop(x):
        stems.append(stemmer.stem(token).encode('utf8'))
    return " ".join(stems)


def attrsJsonDecode(x):
    """Flatten an item's attribute JSON and stem the nested text values.

    ``x`` is expected to be a JSON object.  Values that look like an
    hstore-style list (``"key"=>"value", ...``) are expanded: a plain inner
    value is stored, stemmed, under ``"<outer>-<inner>"``; an inner value that
    is itself such a list is expanded one more level into
    ``"<outer>-<key2>": {<inner>: stemmed}``.  Other values are copied as-is.

    Returns the flattened mapping as a JSON string.  When ``x`` is not valid
    JSON, or is valid JSON but not an object, ``x`` itself is returned.
    A nested list that cannot be parsed is reported on stderr and skipped.
    """
    import sys

    def is_hstore(value):
        # Non-string values (numbers, lists, ...) never qualify.
        return (hasattr(value, "startswith")
                and value.startswith('"') and '"=>"' in value)

    def parse_hstore(value):
        # Rewrite the "k"=>"v" pairs as JSON members and undo the slash escaping.
        return json.loads(
            '{' + value.replace('"=>"', '": "').replace('//"', '\\"').replace('////', '//') + '}')

    try:
        attrs = json.loads(x)
    except (TypeError, ValueError):
        return x
    if not isinstance(attrs, dict):
        return x

    res = {}
    for k in attrs:
        v = attrs[k]
        if not is_hstore(v):
            res[k] = v
            continue
        subres = {}
        V = parse_hstore(v)
        for k1 in sorted(V.keys()):
            inner = V[k1]
            # Test the inner value itself; testing the outer string here is
            # always true and sent plain quoted values down the parse path,
            # where they failed and were dropped.
            if is_hstore(inner):
                try:
                    w = parse_hstore(inner)
                except ValueError:
                    sys.stderr.write("attrsJsonDecode: cannot parse %r\n" % (inner,))
                    continue
                for k2 in w:
                    key = "-".join([k, k2])
                    if key not in subres:
                        subres[key] = {k1: stemstring(w[k2])}
                    else:
                        subres[key][k1] = stemstring(w[k2])
            else:
                subres["-".join([k, k1])] = stemstring(inner)
        res.update(subres)
    return json.dumps(res)


def split_files(data_dir="."):
    """Split the ItemInfo tables into one pre-processed CSV per column.

    For ``ItemInfo_train.csv`` and ``ItemInfo_test.csv`` every listed column is
    written to ``<file>_<column>.csv`` inside ``data_dir`` unless that file
    already exists.  The source table is only loaded (and joined with
    ``Category.csv`` / ``Location.csv``) once a column is actually missing.

    title / description are stemmed with ``stemstring``, stoptitle /
    stopdescription are derived from title / description with
    ``stemstringstop``, attrsJSON goes through ``attrsJsonDecode``; all other
    columns are copied unchanged.

    NOTE(review): the stemmed text replaces ``c["title"]`` /
    ``c["description"]`` in memory, so the stop* columns are built from
    stemmed text when the plain column was produced in the same run and from
    raw text when its CSV already existed - confirm which one is intended.
    """
    import os
    import pandas as pd
    import sys
    from sklearn.externals.joblib import Parallel, delayed
    for fn in ["ItemInfo_train.csv", "ItemInfo_test.csv"]:
        c = None
        columns = "categoryID,parentCategoryID,title,description,images_array,attrsJSON,price,locationID,regionID,metroID,lat,lon,stoptitle,stopdescription".split(",")
        for colname in columns:
            colfn = "/".join([data_dir, fn + "_" + colname + ".csv"])
            if os.path.exists(colfn):
                continue
            sys.stderr.write(fn + ": " + colname)
            if c is None:
                c = pd.read_csv(data_dir + "/" + fn, index_col=0)
                c = c.join(pd.read_csv(data_dir + "/Category.csv", index_col=0), on="categoryID")
                c = c.join(pd.read_csv(data_dir + "/Location.csv", index_col=0), on="locationID")
            if colname in ["title", "description"]:
                c[colname] = Parallel(-1)(delayed(stemstring)(_) for _ in c[colname].fillna("").values)
            elif colname in ["stoptitle", "stopdescription"]:
                c[colname] = Parallel(-1)(delayed(stemstringstop)(_) for _ in c[colname.replace("stop", "")].fillna("").values)
            elif colname in ["attrsJSON"]:
                c[colname] = Parallel(-1)(delayed(attrsJsonDecode)(_) for _ in c[colname].fillna("").values)
            # Single write per column (title/description used to be written twice).
            c[[colname]].to_csv(colfn)
            sys.stderr.write(" DONE\n")

# Executed as soon as this cell/module runs: build the per-column files and
# the image comparison tables under ./data (steps whose output file already
# exists are skipped inside the functions).
split_files("data")
process_images("data")

def printreturn(x):
    """Print ``x`` and return it unchanged (handy inside expressions when debugging)."""
    # Parenthesised single-argument form prints the same on Python 2 and is
    # valid syntax on Python 3 as well.
    print(x)
    return x

def split_pair_files(data_dir="."):
    """Derive per-pair image-distance feature files from the comparison tables.

    For each mode ("train", "test") two families of CSV files are produced in
    ``data_dir`` (a file that already exists is left alone):

    1. ``ItemPairs_<mode>.csv_imagesdist_<metric>[_<threshold>].csv`` - for
       every threshold, the share of images that have no counterpart in the
       other item, where two images count as matching when their
       ``"a|b"`` (or ``"b|a"``) key passes the threshold filter applied to
       ``ItemPairs_<mode>.csv_Images.csv``.
    2. ``ItemPairs_<mode>.csv_imagesdist_<field>_<min|mean|max>.csv`` - the
       per-pair aggregate of each metric column of
       ``ItemPairs_<mode>.csv_ImgCmp.csv``.  If any of these files is missing,
       all of them are regenerated.

    NOTE(review): family 1 reads ``..._Images.csv`` while ``process_images``
    writes ``..._ImgCmp.csv``; presumably a separate table indexed by image
    pair keys - confirm where it comes from.
    """
    import os
    import numpy as np
    import pandas as pd
    import sys

    def unmatched_share(x, y, idlist):
        # Fraction of the larger image list left without a matching partner.
        z = max(len(x), len(y))
        if z == 0:
            return 0.0
        m = {}
        for i in x:
            # Candidates are fixed per i: images of y not matched so far.
            for j in [_ for _ in y if _ not in m]:
                if "|".join([i, j]) in idlist or "|".join([j, i]) in idlist:
                    m[j] = i
        return 1.0 * (z - len(m)) / z

    for mode in ["train", "test"]:
        fn = "ItemPairs_%s.csv" % mode
        c = None
        imcmp = None
        # (column name, filter returning the index of "matching" image pairs)
        columns = [
            ("imagesdist_Exact", lambda x: x[x["Exact"] > 0.95].index),
        ] + [
            ("imagesdist_Correlation_%d" % _, lambda x, t=_: x[x["Correlation"] > (0.01 * t)].index)
            for _ in range(99, 84, -1)
        ] + [
            ("imagesdist_Chi-Squared_%d" % _, lambda x, t=_: x[x["Chi-Squared"] < (0.01 * t)].index)
            for _ in range(1, 16)
        ] + [
            ("imagesdist_Intersection_%d" % _, lambda x, t=_: x[x["Intersection"] > (0.01 * t)].index)
            for _ in range(250, 150, -10)
        ] + [
            ("imagesdist_Hellinger_%d" % _, lambda x, t=_: x[x["Hellinger"] < (0.01 * t)].index)
            for _ in range(1, 11)
        ]
        for colname, fltr in columns:
            colfn = "/".join([data_dir, fn + "_" + colname + ".csv"])
            if os.path.exists(colfn):
                continue
            sys.stderr.write(fn + ": " + colname)

            if c is None:
                # Resolve against data_dir like every other input (this path
                # used to be hard-coded to "data/").
                imfn = "/".join([data_dir, "ItemInfo_%s.csv_images_array.csv" % mode])
                imcolumn = pd.read_csv(imfn, index_col=0).fillna("").applymap(lambda x: [_ for _ in str(x).split(", ") if _ != ""])

                params = {}
                if mode == "test":
                    params["index_col"] = 0
                c = pd.read_csv(data_dir + "/" + fn, **params)
                c = c.merge(imcolumn[["images_array"]], how="left", left_on="itemID_1", right_index=True, sort=False)
                c = c.merge(imcolumn[["images_array"]], how="left", left_on="itemID_2", right_index=True, sort=False)
                del imcolumn

            if imcmp is None:
                imcmp = pd.read_csv("/".join([data_dir, "ItemPairs_%s.csv_Images.csv" % mode]), index_col=0)

            idlist = fltr(imcmp)

            c[colname] = np.frompyfunc(
                lambda a, b: unmatched_share(a, b, idlist), 2, 1
            )(c["images_array_x"].values, c["images_array_y"].values)

            c[[colname]].to_csv(colfn, index_label="id")
            # Drop the column again so only one feature is held at a time.
            c.drop(colname, axis=1, inplace=True)
            sys.stderr.write(" DONE\n")

        columns = [
            ('imagesdist_%s' % field, field)
            for field in (
                'Correlation',
                'Chi-Squared',
                'Intersection',
                'Hellinger',
                'scipy.braycurtis',
                'scipy.canberra',
                'scipy.chebyshev',
                'scipy.cityblock',
                'scipy.cosine',
                'scipy.euclidean',
                'scipy.hamming',
                'scipy.sqeuclidean',
                'skimage.compare_mse',
                'skimage.compare_ssim_3',
                'skimage.compare_ssim_5',
                'skimage.compare_ssim_7',
                'skimage.compare_mse.adapt',
                'skimage.compare_ssim_3.adapt',
                'skimage.compare_ssim_5.adapt',
                'skimage.compare_ssim_7.adapt',
            )
        ]

        missing = any(
            not os.path.exists("/".join([data_dir, "%s_%s_%s.csv" % (fn, column[0], func)]))
            for column in columns
            for func in ('min', 'mean', 'max')
        )
        if missing:
            for colname, field in columns:
                sys.stderr.write(fn + ": " + field + ": ")
                # Load only the id and the one metric column to limit memory.
                imcmp = pd.read_csv("/".join([data_dir, "ItemPairs_%s.csv_ImgCmp.csv" % mode]), usecols=['id', field])

                for func in 'min', 'mean', 'max':
                    sys.stderr.write(func + " ")
                    groupped = getattr(imcmp.groupby("id"), func)()
                    outname = '_'.join(["imagesdist", field, func])
                    groupped[outname] = groupped[field]
                    groupped[[outname]].to_csv("/".join([data_dir, "%s_%s_%s.csv" % (fn, colname, func)]))
                sys.stderr.write("  DONE\n")

# Executed as soon as this cell/module runs: derive the pair-level image
# distance features for the tables under ./data.
split_pair_files("data")



# In [None]:

import itertools
from scipy import sparse as sp

def bitwise_op(x, y, op):
    """Element-wise logical "and" / "or" / "xor" of two equally shaped matrices.

    Both operands are first reduced to 0/1 (non-zero -> 1).  The result is
    returned as a float CSR matrix, so the inputs must support the sparse
    matrix interface (``tocsr``).  Any other ``op`` raises ``Exception``.
    """
    assert x.shape == y.shape
    lhs = x.astype(bool).astype(int)
    rhs = y.astype(bool).astype(int)
    if op not in ("and", "or", "xor"):
        raise Exception("Unknown operation `%s`" % op)
    # The 0/1 sum per cell is 2 when both are set, 1 for exactly one, else 0.
    total = lhs + rhs
    if op == "xor":
        hits = (total == 1)
    elif op == "and":
        hits = total > 1
    else:
        hits = total > 0
    return hits.astype(float).tocsr()


def preprocess_price(X):
    """Coarsen the ``price`` column of ``X`` in place and return ``X``.

    Steps, applied to ``X["price"]`` only:
      1. cap at 3,000,000,
      2. round every finite value to one significant digit,
      3. raise finite values below 100 to 100.0,
      4. replace missing values with -1.0.
    """
    import math
    import numpy as np

    def round_significant(x, digits):
        return round(x, -int(math.log10(abs(x)) - digits + 1)) if abs(x) > 0.0 else 0.0

    def coarsen(value):
        if not np.isfinite(value):
            return value  # leave NaN/inf for the fillna below
        rounded = round_significant(value, 1)
        return 100.0 if rounded < 100 else rounded

    # Restrict the cap to the price column; a bare boolean-row assignment
    # would overwrite every column of the matching rows.
    X.loc[X["price"] > 3000000, "price"] = 3000000
    X["price"] = X["price"].map(coarsen)
    X["price"] = X["price"].fillna(-1.0)
    return X


def preprocess_pair_price(X):
    """Fill a missing price (-1.0) on one side of a pair with the other side's price.

    Works in place on the ``price_x`` / ``price_y`` columns and returns ``X``.
    ``price_x`` is patched first, so a pair missing both prices keeps -1.0 on
    both sides.
    """
    for target, source in (("price_x", "price_y"), ("price_y", "price_x")):
        missing = X[target] == -1.0
        X.loc[missing, target] = X[source]
    return X


def getX(pr, data_dir=".", filename="ItemInfo_train.csv", columns=None):
    """Attach per-item feature columns to every pair in ``pr``.

    ``pr`` must contain ``itemID_1`` and ``itemID_2``.  For each name in
    ``columns`` the file ``<data_dir>/<filename>_<name>.csv`` (indexed by item
    id) is left-joined twice - once per item - yielding ``<name>_x`` and
    ``<name>_y``.  ``price`` is additionally cleaned with
    ``preprocess_price`` / ``preprocess_pair_price``.

    ``columns=None`` (the default) is treated as "no columns" and returns
    just a copy of the two id columns.
    """
    import pandas as pd
    X = pr[["itemID_1", "itemID_2"]].copy()
    for colname in (columns if columns is not None else ()):
        colfn = "/".join([data_dir, filename + "_" + colname + ".csv"])
        column = pd.read_csv(colfn, index_col=0)
        if colname == "price":
            column = preprocess_price(column)
        X = X.merge(column[[colname]], how="left", left_on="itemID_1", right_index=True, sort=False)
        X = X.merge(column[[colname]], how="left", left_on="itemID_2", right_index=True, sort=False)
        if colname == "price":
            X = preprocess_pair_price(X)
        # Free the per-column table before loading the next one.
        del column
    return X


# In [None]:

import datetime
import gc
import gzip
import editdistance
import itertools
import os
import numpy as np
import pandas as pd

from pyxdameraulevenshtein import damerau_levenshtein_distance as dld
from scipy import sparse as sp
from sklearn.base import BaseEstimator, ClassifierMixin, TransformerMixin, clone
from sklearn.cluster import MiniBatchKMeans
from sklearn.decomposition import TruncatedSVD
from sklearn.externals.joblib import Parallel, cpu_count, delayed
from sklearn.externals.joblib import Memory, dump, load
from sklearn.feature_extraction import FeatureHasher
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.metrics.pairwise import distance_metrics
from sklearn.metrics import pairwise_distances
from sklearn.svm import LinearSVC
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle
from sklearn.utils.metaestimators import if_delegate_has_method
from types import MethodType
from xgboost import XGBClassifier

# Shared joblib on-disk memoization store; the location can be overridden with
# the CACHEDIR environment variable.
memory = Memory(cachedir=os.getenv("CACHEDIR", "data/avito-cache"))


@memory.cache
def getparams(est, *args, **kwargs):
    """Fit a clone of ``est`` and return its learned attributes (memoized on disk).

    "Learned" means every instance attribute whose name ends in exactly one
    trailing underscore (``coef_`` but not ``__x__``).  ``est`` itself is not
    modified.
    """
    # Imported locally: the module never imports ``re`` at the top level.
    import re
    e = clone(est)
    e.fit(*args, **kwargs)
    return {k: v for k, v in e.__dict__.items() if re.search(r'[^_]_$', k)}

def cached(obj):
    """Patch estimator instance ``obj`` so ``fit`` and ``transform`` go through ``memory``.

    ``obj.transform`` is replaced by its ``memory.cache`` wrapper.  ``obj.fit``
    is replaced by a method that asks ``getparams`` for the attributes learned
    by fitting a clone and copies them onto ``obj``.  Returns the same
    (mutated) ``obj``.
    """
    obj.transform = memory.cache(obj.transform)
    def fit(self, *args, **kwargs):
        # Debug trace, emitted on every call to the patched fit.
        print "fit"
        # getparams fits a clone (memoized); adopt the fitted attributes here.
        d = getparams(self, *args, **kwargs)
        self.__dict__.update(d)
        return self
    # Bind ``fit`` to this instance only (three-argument Python 2 MethodType).
    setattr(obj, "fit", MethodType(fit, obj, obj.__class__))
#     obj.fit = MethodType(fit, obj, obj.__class__)
    return obj


class Cached(BaseEstimator):
    """Wrap an estimator so its fitted state lives on disk between calls.

    ``fit`` fits a clone of ``est`` (through ``memory.cache``), dumps it with
    joblib into a per-process scratch directory and drops it from memory.
    ``transform`` / ``predict*`` load it back, run the (memoized) call and
    drop it again.  The delegating methods are only exposed when ``est``
    provides the method of the same name.

    The scratch directory is created by ``baseinit`` (called once right after
    the class definition) and removed at interpreter exit.
    """

    @classmethod
    def baseinit(cls):
        """Create the scratch directory and register its removal at exit."""
        import atexit
        import os
        import uuid
        cls.base_dir = os.getenv("CACHEDIR", "data/avito-cache") + "/Cached-" + str(uuid.uuid4())
        os.mkdir(cls.base_dir)
        atexit.register(cls.basedel)

    @classmethod
    def basedel(cls):
        """Best-effort removal of the scratch directory."""
        import shutil
        # Runs at interpreter exit: a failure to clean up must not raise.
        shutil.rmtree(cls.base_dir, ignore_errors=True)

    def __init__(self, est):
        self.est = est

    def _load(self):
        """Load the dumped fitted estimator into ``self.est_``."""
        from sklearn.externals.joblib import load
        self.est_ = load(self.__class__.base_dir + "/" + self.uuid_)

    def _dump(self):
        """Dump ``self.est_`` under a fresh file name and drop it from memory."""
        import uuid
        from sklearn.externals.joblib import dump
        self.uuid_ = "joblib-" + str(uuid.uuid4()) + ".est"
        dump(self.est_, self.__class__.base_dir + "/" + self.uuid_)
        del self.est_

    def _call_loaded(self, method, X):
        """Load the estimator, run its memoized ``method`` on ``X``, unload it."""
        self._load()
        try:
            return memory.cache(getattr(self.est_, method))(X)
        finally:
            # Release the estimator even when the call raises.
            del self.est_

    def fit(self, X, y=None):
        """Fit a clone of ``est`` and park it on disk; returns ``self``."""
        from sklearn.base import clone
        self.est_ = clone(self.est)
        if y is None:
            self.est_ = memory.cache(self.est_.fit)(X)
        else:
            self.est_ = memory.cache(self.est_.fit)(X, y)
        self._dump()
        return self

    @if_delegate_has_method(delegate="est")
    def transform(self, X):
        return self._call_loaded("transform", X)

    @if_delegate_has_method(delegate="est")
    def fit_transform(self, X, y=None):
        return self.fit(X, y).transform(X)

    @if_delegate_has_method(delegate="est")
    def predict(self, X):
        return self._call_loaded("predict", X)

    @if_delegate_has_method(delegate="est")
    def predict_proba(self, X):
        return self._call_loaded("predict_proba", X)

    @if_delegate_has_method(delegate="est")
    def predict_log_proba(self, X):
        return self._call_loaded("predict_log_proba", X)
Cached.baseinit()


class Selector(BaseEstimator, TransformerMixin):
    """Stateless transformer that keeps or drops columns by position.

    ``include`` (if given) is used directly as the column indexer and takes
    precedence; otherwise every column whose position is not in ``exclude``
    is kept; with neither set the input passes through untouched.
    """

    def __init__(self, include=None, exclude=None):
        self.include = include
        self.exclude = exclude

    def fit(self, X, y=None):
        # Nothing to learn.
        return self

    def transform(self, X):
        if self.include is not None:
            return X[:, self.include]
        if self.exclude is None:
            return X
        kept = [col for col in range(X.shape[1]) if col not in self.exclude]
        return X[:, kept]

class JSONAttr(BaseEstimator, TransformerMixin):
    """Hash the ``key: value`` pairs of JSON-like strings into sparse features.

    Every column of the input is processed on its own and the hashed
    matrices are stacked side by side.
    """

    def __init__(self):
        super(JSONAttr, self).__init__()

    def fit(self, X, y=None):
        # Stateless: the hasher needs no fitting.
        return self

    def transform(self, X):
        def tokens(cell):
            # '{"a": "b", "c": "d"}' -> ['"a"___"b"', '"c"___"d"']
            body = cell.strip("{}")
            return ["___".join(pair.split(": ")) for pair in body.split(", ")]

        hashed = []
        for i in range(X.shape[1]):
            rows = (tokens(cell) for cell in iter(X[:, i]))
            block = FeatureHasher(input_type="string").fit_transform(rows)
            hashed.append(block.astype(float))
        return sp.hstack(hashed)

class SwitchTransformer(BaseEstimator, TransformerMixin):
    """Apply one shared transformer to each column and combine the outputs.

    ``est`` is fitted on all cells of the input reshaped into one column, then
    used to transform every column separately.  ``preaction`` ("ravel" or a
    callable) prepares the data handed to ``est``.  ``postaction`` merges the
    per-column outputs: "diff" (first minus second), "and"/"or"/"xor"
    (``bitwise_op`` of the first two), a list of such operations (stacked),
    a callable receiving the list, or ``None`` to stack them horizontally.
    """

    def __init__(self, est, postaction=None, preaction=None):
        super(SwitchTransformer, self).__init__()
        self.est = est
        self.postaction = postaction
        self.preaction = preaction

    def fit(self, X, y=None):
        print(X.shape)
        stacked = X.reshape(-1, 1)
        self.est.fit(self._preprocess(stacked))
        return self

    def transform(self, X):
        per_column = []
        for i in range(X.shape[1]):
            per_column.append(self.est.transform(self._preprocess(X[:, [i]])))
        Xs = self._postprocess(per_column)
        print(Xs.shape)
        return Xs

    def _preprocess(self, X):
        action = self.preaction
        if action == "ravel":
            return X.ravel()
        if action is None:
            return X
        return action(X)

    def _postprocess(self, Xs):
        action = self.postaction
        if action == "diff":
            return Xs[0] - Xs[1]
        if action in ("and", "or", "xor"):
            return bitwise_op(Xs[0], Xs[1], action)
        if isinstance(action, list):
            # One combined matrix per requested operation, stacked below.
            Xs = [bitwise_op(Xs[0], Xs[1], op) for op in action]
        elif action is not None:
            return action(Xs)
        # Stack horizontally, staying sparse if any part is sparse.
        if any(sp.issparse(part) for part in Xs):
            return sp.hstack(Xs).tocsr()
        return np.hstack(Xs)
    
class SupervisedDecision(BaseEstimator, TransformerMixin):
    """Expose a supervised estimator's output as a transformer feature.

    By default the feature is column 1 of ``predict_proba``; ``method`` names
    another method of ``est`` to call instead.  The output always has two
    dimensions.
    """

    def __init__(self, est, method=None):
        super(SupervisedDecision, self).__init__()
        self.est = est
        self.method = method

    def fit(self, X, y=None):
        self.est.fit(X, y)
        return self

    def transform(self, X):
        if self.method is None:
            scores = self.est.predict_proba(X)[:, [1]]
        else:
            scorer = getattr(self.est, self.method)
            scores = scorer(X)
        # Promote a flat vector to a single column.
        if len(scores.shape) == 1:
            return scores.reshape(-1, 1)
        return scores

def _estimator_fit(est, X, y=None):
    """Fit ``est`` on ``(X, y)`` and return what ``est.fit`` returns.

    A plain module-level function so it can be wrapped with joblib's
    ``delayed``.
    """
    fitted = est.fit(X, y)
    return fitted
    
def _estimator_transform(est, X):
    """Return ``est.transform(X)`` (function form of the method call)."""
    transformed = est.transform(X)
    return transformed
    
def _estimator_predict_proba(est, X):
    """Return column 1 of ``est.predict_proba(X)`` as a flat vector."""
    probabilities = est.predict_proba(X)
    return probabilities[:, 1]

class FilteredSupervisedDecision(BaseEstimator, TransformerMixin):
    """Train and apply ``est`` only on rows where column ``switch`` equals ``val``.

    ``xcols`` selects the feature columns handed to ``est``.  ``transform``
    returns a CSR matrix with one row per input row; rows that do not match
    stay zero.  Matching rows receive column 1 of ``predict_proba`` when
    ``est`` has that method, otherwise the output of ``est.transform``.
    """

    def __init__(self, est, switch, val, xcols):
        super(FilteredSupervisedDecision, self).__init__()
        self.est = est
        self.switch = switch
        self.val = val
        self.xcols = xcols

    def fit(self, X, y=None):
        selected = X[:, self.switch] == self.val
        self.est.fit(X[selected][:, self.xcols], y[selected])
        return self

    def transform(self, X):
        n_rows = X.shape[0]
        selected = X[:, self.switch] == self.val
        subset = X[selected][:, self.xcols]
        if hasattr(self.est, "predict_proba"):
            out = sp.lil_matrix((n_rows, 1), dtype=float)
            out[selected, 0] = self.est.predict_proba(subset)[:, [1]]
        else:
            projected = self.est.transform(subset)
            # Width follows whatever the wrapped transformer produced.
            out = sp.lil_matrix((n_rows, projected.shape[1]), dtype=float)
            out[selected] = projected
        return out.tocsr()

class MegaSupervisedDecision(BaseEstimator, TransformerMixin):
    """Fit one clone of ``est`` per distinct value of column ``switch``.

    ``fit`` trains the clones in parallel (``n_jobs``) on the ``xcols``
    features of the rows belonging to each value.  ``transform`` returns a
    single column holding, for every row, the class-1 probability from the
    clone of that row's value; rows with a value not seen in ``fit`` stay 0.
    """

    def __init__(self, est, switch, xcols, n_jobs=-1):
        super(MegaSupervisedDecision, self).__init__()
        self.est = est
        self.switch = switch
        self.xcols = xcols
        self.n_jobs = n_jobs

    def fit(self, X, y=None):
        self.uniqs_ = np.unique(X[:, self.switch].ravel())
        print(self.uniqs_)
        jobs = (
            delayed(_estimator_fit)(
                clone(self.est),
                X[X[:, self.switch] == u][:, self.xcols],
                y[X[:, self.switch] == u])
            for u in self.uniqs_
        )
        fitted = Parallel(self.n_jobs)(jobs)
        # Parallel returns results in submission order, i.e. aligned with uniqs_.
        self.ests_ = dict(zip(self.uniqs_, fitted))
        return self

    def transform(self, X):
        xnew = np.zeros(X.shape[0], dtype=float)
        jobs = (
            delayed(_estimator_predict_proba)(
                self.ests_[u],
                X[X[:, self.switch] == u][:, self.xcols])
            for u in self.uniqs_
        )
        probas = Parallel(self.n_jobs)(jobs)
        for u, proba in zip(self.uniqs_, probas):
            rows = X[:, self.switch] == u
            xnew[rows] = proba
        print(xnew)
        return xnew.reshape(-1, 1)

class EqNonEqBinarizer(BaseEstimator, TransformerMixin):
    """Encode a pair of values as "equal" / "not equal" features.

    The first two columns of every row are compared and three intermediate
    columns are built: the shared value where both are equal, and the first
    and the second value where they differ.  Positions that do not apply are
    filled with 0.  With ``binarize=True`` each intermediate column is
    one-hot encoded by a ``LabelBinarizer`` fitted on all values of the two
    input columns, and the two "differ" encodings are added together.
    """

    def __init__(self, binarize=True):
        super(EqNonEqBinarizer, self).__init__()
        self.binarize = binarize

    def fit(self, X, y=None):
        """Fit the label binarizer on the first two columns (if enabled)."""
        if not self.binarize:
            return self
        self.est = LabelBinarizer(sparse_output=True)
        self.est.fit(X[:, :2].astype(float).ravel())
        return self

    def transform(self, X):
        """Return the equal/differ encoding of the first two columns.

        Sparse ``[equal one-hot | differ one-hot]`` when ``binarize`` is set,
        otherwise the dense ``(n_rows, 3)`` float array of intermediate values.
        """
        left, right = X[:, 0], X[:, 1]
        same = left == right
        differ = left != right
        picked = np.where(
            np.array([same, differ, differ]),
            np.array([left, left, right]),
            0,
        ).T.astype(float)
        if not self.binarize:
            return picked
        equal_part = self.est.transform(picked[:, 0].ravel())
        differ_part = (self.est.transform(picked[:, 1].ravel())
                       + self.est.transform(picked[:, 2].ravel()))
        return sp.hstack((equal_part, differ_part))

    @staticmethod
    def test():
        """Smoke test executed when the module is loaded."""
        pairs = np.array([
            [1, 1],
            [2, 1],
            [1, 2],
            [3, 2],
            [3, 3],
        ])
        expected = np.array([
            [1, 0, 0, 0, 0, 0],
            [0, 0, 0, 1, 1, 0],
            [0, 0, 0, 1, 1, 0],
            [0, 0, 0, 0, 1, 1],
            [0, 0, 1, 0, 0, 0],
        ])
        ene = EqNonEqBinarizer()
        assert (ene.fit_transform(pairs).toarray() == expected).all()
EqNonEqBinarizer.test()


def block_calc_distances(X, metrics):
    """Compute row-wise distances between the two halves of every row of X.

    Each row is split in the middle; the left half is compared with the right
    half using every metric in ``metrics``.  Metrics listed by
    ``distance_metrics()`` are computed on the row as given, all others on a
    densified copy obtained with ``row.todense()`` (so rows are presumably
    scipy sparse matrices -- TODO confirm against callers).

    Returns an array with one row per input row and one value per metric,
    ordered with the ``distance_metrics()`` metrics first.
    """
    sparse_capable = distance_metrics()
    sparse_metrics = [m for m in metrics if m in sparse_capable]
    dense_metrics = [m for m in metrics if m not in sparse_capable]

    # Floor division keeps the split point an integer even when true
    # division is in effect (Python 3 or ``from __future__ import division``).
    half = X.shape[-1] // 2
    rows = []
    for row in X:
        values = [
            pairwise_distances(row[:, :half], row[:, half:], metric)
            for metric in sparse_metrics
        ]
        if dense_metrics:
            dense_row = row.todense()
            values += [
                pairwise_distances(dense_row[:, :half], dense_row[:, half:], metric)
                for metric in dense_metrics
            ]
        rows.append(np.array(values).ravel())
    return np.array(rows)

class DistanceTransformer(BaseEstimator, TransformerMixin):
    """Turn rows of two concatenated vectors into distance features.

    ``metrics`` is the list of metrics handed to ``block_calc_distances``.
    ``n_jobs`` is the number of parallel workers; ``-1`` means one per CPU and
    is resolved when the object is constructed.
    """

    def __init__(self, metrics, n_jobs=-1):
        super(DistanceTransformer, self).__init__()
        self.metrics = metrics
        if n_jobs == -1:
            n_jobs = cpu_count()
        self.n_jobs = n_jobs

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, X):
        """Compute the distances in parallel over contiguous row chunks.

        Returns an ``(n_rows, len(metrics))`` array.
        """
        size = X.shape[0]
        if size == 0:
            # Nothing to dispatch; np.vstack([]) would raise.
            return np.zeros((0, len(self.metrics)))

        # Work on a local copy: the worker count is clamped to the batch size
        # and that clamp must not leak into later, larger batches.
        n_jobs = cpu_count() if self.n_jobs == -1 else self.n_jobs
        chsz, extra = divmod(size, n_jobs)
        if extra:
            chsz += 1
        if size <= n_jobs:
            n_jobs = max(size, 1)
            chsz = 1
        return np.vstack(
            Parallel(n_jobs)(
                delayed(block_calc_distances)(X[i:min(i + chsz, size)], self.metrics)
                for i in range(0, size, chsz)
            )
        )

    @staticmethod
    def test():
        """Smoke test executed when the module is loaded."""
        dist = Pipeline([
                    ('transform', SwitchTransformer(est=TfidfVectorizer(), preaction=lambda X: X.ravel())),
                    ('dists', DistanceTransformer(metrics=["cosine"], n_jobs=1)),
                ])
        assert (np.round(dist.fit_transform(np.array([
                ["nokia 3310", "nokia 3310"],
                ["samsung A5", "samsung A7"],
                ["audi a4", "bmw a4"],
                ["lada priora", "niva chevrolet"],
            ])), 2) == np.array([
                [
                        [0.0],
                        [0.59],
                        [0.59],
                        [1.0],
                ]
            ])).all()
        del dist
DistanceTransformer.test()


class ConstImputer(BaseEstimator, TransformerMixin):
    """Replace missing cells with a constant.

    A cell counts as missing when its string representation is ``'nan'``
    (which also matches a literal ``"nan"`` string).
    """

    def __init__(self, fillvalue):
        super(ConstImputer, self).__init__()
        self.fillvalue = fillvalue

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with missing cells set to ``fillvalue``.

        The input is left untouched so the caller's data is not silently
        modified.
        """
        filled = X.copy()
        filled[X.astype(str) == 'nan'] = self.fillvalue
        return filled

    @staticmethod
    def test():
        """Smoke test executed when the module is loaded."""
        tr = ConstImputer("fillvalue")
        assert (tr.fit_transform(np.array([
                ["nokia 3310"],
                [np.nan],
                ["lada priora"],
            ]).astype(object)) == np.array([
                [
                    ["nokia 3310"],
                    ["fillvalue"],
                    ["lada priora"],
                ]
            ])).all()
        del tr
ConstImputer.test()


class IdFreq(BaseEstimator, TransformerMixin):
    """Collapse infrequent ids into a single replacement id.

    Ids seen at least ``threshold`` times during ``fit`` are kept.  Every
    other id (including ids never seen during ``fit``) is replaced by
    ``newid``, the smallest id that fell below the threshold.
    """

    def __init__(self, threshold=1):
        super(IdFreq, self).__init__()
        self.threshold = threshold

    def fit(self, X, y=None):
        """Learn the whitelist of frequent ids and the replacement id.

        ``newid`` is ``None`` when every id reached the threshold.
        """
        ids, counts = np.unique(X.ravel(), return_counts=True)
        frequent = counts >= self.threshold
        self.whitelist = ids[frequent]
        rare = ids[~frequent]
        self.newid = rare[0] if len(rare) else None
        return self

    def transform(self, X):
        """Return a copy of ``X`` with non-whitelisted ids replaced by ``newid``.

        The input array is not modified.
        """
        # NOTE(review): when ``newid`` is None and X holds an unseen id, the
        # assignment below stores None, which a numeric array cannot hold.
        out = X.copy()
        # Only ids present in X and missing from the whitelist need work.
        for i in set(np.unique(X.ravel())) - set(self.whitelist):
            out[X == i] = self.newid
        return out

    @staticmethod
    def test():
        """Smoke test executed when the module is loaded."""
        tr = IdFreq(2)
        assert (tr.fit_transform(np.array([
                [1],
                [2],
                [1],
                [3],
            ])) == np.array([
                [
                    [1],
                    [2],
                    [1],
                    [2],
                ]
            ])).all()
IdFreq.test()

class SumClassifier(BaseEstimator):
    """Stateless step that reduces every row to the sum of its columns."""

    def __init__(self):
        """No hyper-parameters."""

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, X):
        """Return ``X.sum(axis=1)``."""
        row_totals = X.sum(axis=1)
        return row_totals

class CoordDist(BaseEstimator, TransformerMixin):
    """Euclidean distance between two points given as four columns.

    Columns 0 and 1 hold the first coordinate of both points, columns 2 and 3
    the second coordinate.
    """

    def __init__(self):
        """No hyper-parameters."""

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, X):
        """Return the distances as an ``(n_rows, 1)`` column."""
        first_delta = X[:, 0] - X[:, 1]
        second_delta = X[:, 2] - X[:, 3]
        squared = first_delta ** 2 + second_delta ** 2
        return np.sqrt(squared).reshape(-1, 1)

class LocationDist(BaseEstimator, TransformerMixin):
    """Bucket the distance of each item from the centre of its location.

    Input columns are ``[loc_1, loc_2, a_1, a_2, b_1, b_2]``: the location id
    and two coordinates of both items of a pair.  ``fit`` stores the mean
    coordinates per location id; ``transform`` measures how far every item is
    from the mean of its location and one-hot encodes that distance using
    ``thresholds`` (an ascending sequence, required by ``transform``).

    Output columns:
      0                  location not seen during ``fit`` / non-finite distance
      1                  distance < thresholds[0]
      2                  distance >= thresholds[-1]
      3 .. len+1         thresholds[i] <= distance < thresholds[i+1]
    The flags of both items of a pair are added, so cells are 0, 1 or 2.
    """

    def __init__(self, thresholds=None):
        self.thresholds = thresholds

    @staticmethod
    def _flatten(X):
        """Return one ``(location id, coord, coord)`` row per item.

        The two items of pair ``n`` end up in rows ``2n`` and ``2n + 1``.
        """
        return np.array([
            X[:, [0, 1]].ravel(),
            X[:, [2, 3]].ravel(),
            X[:, [4, 5]].ravel(),
        ]).T

    def fit(self, X, y=None):
        """Store the mean coordinates of every location id in ``locs_``."""
        flat = self._flatten(X)
        self.locs_ = {}
        for lid in np.unique(flat[:, 0]):
            self.locs_[lid] = flat[flat[:, 0] == lid, 1:].mean(axis=0)
        return self

    def transform(self, X):
        """Return the CSR matrix of distance-bucket flags, one row per pair."""
        flat = self._flatten(X)
        dist = np.zeros((flat.shape[0], 1), dtype=float)

        # Items whose location has no stored centre get a NaN distance.
        for lid in set(np.unique(flat[:, 0])) - set(self.locs_):
            dist[flat[:, 0] == lid, 0] = np.nan
        for lid, centre in self.locs_.items():
            rows = flat[:, 0] == lid
            dist[rows, 0] = np.sqrt(((flat[rows, 1:] - centre) ** 2).sum(axis=1))

        thresholds = self.thresholds
        result = sp.lil_matrix((dist.shape[0], 2 + len(thresholds)), dtype=int)

        def _flag(condition, column):
            # Set ``column`` to 1 on every row where ``condition`` holds.
            rows = np.where(condition)[0]
            result[rows, np.array([column]).repeat(rows.shape[0])] = 1

        _flag(~np.isfinite(dist), 0)
        _flag(dist < thresholds[0], 1)
        for i in range(len(thresholds) - 1):
            _flag(np.logical_and(dist >= thresholds[i], dist < thresholds[i + 1]), i + 3)
        # ``>=`` so that a distance exactly equal to the last threshold lands
        # in the top bucket instead of in no bucket at all.
        _flag(dist >= thresholds[-1], 2)

        # Merge the two item rows of each pair.
        result = result[::2] + result[1::2]
        return result.tocsr()

    @staticmethod
    def test():
        """Smoke test executed when the module is loaded."""
        tr = LocationDist([10, 13, 15])
        assert (tr.fit_transform(np.array([
                        [1, 2, 10, 10, 20, 20],
                        [2, 3, 10, 20, 20, 20],
                        [3, 1, 0, 0, 0, 0]
            ])) == np.array([
                    [0, 1, 0, 1, 0],
                    [0, 1, 0, 0, 1],
                    [0, 0, 0, 1, 1],
            ])).all()
        del tr
LocationDist.test()


def json2dict(X):
    """Parse every JSON string in ``X`` into a flat feature dict.

    For each top-level key ``k`` of the parsed object:
      * a dict value is kept as is under ``k``;
      * any other value ``v`` produces the two markers ``k + "__"`` and
        ``k + "___" + v``, both mapped to ``"1"``.

    Returns an object array with the same shape as ``X``.
    """
    def _inner(x):
        res = {}
        for k, v in json.loads(x).items():
            if isinstance(v, dict):
                res[k] = v
            else:
                res[k + "__"] = "1"
                # %-formatting instead of ``+`` so numbers, booleans and null
                # do not raise TypeError; strings give the same key as before.
                res[u"%s___%s" % (k, v)] = "1"
        return res
    return np.frompyfunc(_inner, 1, 1)(X)

class JsonToDict(BaseEstimator, TransformerMixin):
    """Parallel wrapper around ``json2dict``.

    ``n_jobs`` is the number of workers; ``-1`` means one per CPU.
    """

    def __init__(self, n_jobs=-1):
        self.n_jobs = n_jobs

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, X):
        """Apply ``json2dict`` to contiguous row chunks of ``X`` in parallel.

        Returns the stacked chunk results (an object array shaped like ``X``).
        """
        size = X.shape[0]
        if size == 0:
            # Nothing to dispatch; np.vstack([]) would raise.
            return np.empty(X.shape, dtype=object)

        # Resolve and clamp the worker count locally so that one small batch
        # does not permanently change the ``n_jobs`` hyper-parameter.
        n_jobs = cpu_count() if self.n_jobs == -1 else self.n_jobs
        chsz, extra = divmod(size, n_jobs)
        if extra:
            chsz += 1
        if size <= n_jobs:
            n_jobs = max(size, 1)
            chsz = 1
        return np.vstack(
            Parallel(n_jobs)(
                delayed(json2dict)(X[i:min(i + chsz, size)])
                for i in range(0, size, chsz)
            )
        )


def jsonattrstrf_fit(X):
    """Return the sorted array of distinct keys used by the dicts in ``X``.

    ``X`` is iterated row by row; every element of a row must be a dict.
    An input without any key yields an empty array.
    """
    keys = set()
    for row in X:
        # np.ravel also copes with a row that is a single dict.
        for attrs in np.ravel(row):
            keys.update(attrs)
    # A set keeps memory proportional to the number of distinct keys and
    # avoids concatenating an empty float array with string arrays.
    return np.unique(np.array(list(keys)))

def jsonattrstrf_transform(X, keylist):
    """Compare the attribute dicts of every pair, key by key.

    ``X[i, 0]`` and ``X[i, 1]`` are dicts.  The result has two columns per
    key of ``keylist``: column ``pos`` holds a match score and column
    ``len(keylist) + pos`` a mismatch flag.

      * key present in both with non-dict values -> score 1.0
      * key present in both with dict values     -> mean token Jaccard
        similarity over the union of sub-keys (a sub-key missing on one
        side counts as 0.0)
      * key missing on one side, or dict on one side only -> mismatch 1.0

    Keys not listed in ``keylist`` are ignored.  Returns a CSR matrix.
    """
    tokenize = CountVectorizer().build_analyzer()
    column_of = dict((key, pos) for pos, key in enumerate(keylist))
    n_keys = len(column_of)
    out = sp.lil_matrix((X.shape[0], 2 * n_keys), dtype=float)

    def _token_jaccard(left, right):
        # Jaccard similarity of the token sets; two empty sets count as equal.
        left_tokens = set(tokenize(left))
        right_tokens = set(tokenize(right))
        union = left_tokens | right_tokens
        if len(union) == 0:
            return 1.0
        return 1.0 * len(left_tokens & right_tokens) / len(union)

    for i in xrange(X.shape[0]):
        first, second = X[i, 0], X[i, 1]
        for k in set(first.keys()) | set(second.keys()):
            if k not in column_of:
                continue
            col = column_of[k]
            if k in first and k in second:
                v1, v2 = first[k], second[k]
                v1_nested = isinstance(v1, dict)
                v2_nested = isinstance(v2, dict)
                if not v1_nested and not v2_nested:
                    out[i, col] = 1.0
                    continue
                if v1_nested and v2_nested:
                    scores = []
                    for k1 in set(v1.keys()) | set(v2.keys()):
                        if k1 in v1 and k1 in v2:
                            scores.append(_token_jaccard(v1[k1], v2[k1]))
                        else:
                            scores.append(0.0)
                    out[i, col] = np.array(scores).astype(float).mean()
                    continue
            # Missing on one side, or a dict compared with a plain value.
            out[i, n_keys + col] = 1.0
    return out.tocsr()

class JsonAttrsTrf(BaseEstimator, TransformerMixin):
    """Parallel key-by-key comparison of pairs of attribute dicts.

    ``fit`` collects the key vocabulary with ``jsonattrstrf_fit``;
    ``transform`` builds the features with ``jsonattrstrf_transform``.
    ``n_jobs`` is the number of workers; ``-1`` means one per CPU.
    """

    def __init__(self, n_jobs=-1):
        self.n_jobs = n_jobs

    def _chunking(self, size):
        """Return ``(n_jobs, chunk_size)`` for splitting ``size`` rows.

        Computed locally on every call so the ``n_jobs`` hyper-parameter is
        never overwritten and ``-1`` is honoured by ``transform`` as well.
        """
        n_jobs = cpu_count() if self.n_jobs == -1 else self.n_jobs
        chsz, extra = divmod(size, n_jobs)
        if extra:
            chsz += 1
        if size <= n_jobs:
            n_jobs = max(size, 1)
            chsz = 1
        return n_jobs, chsz

    def fit(self, X, y=None):
        """Collect the distinct keys of all dicts in ``X`` into ``keys_``."""
        size = X.shape[0]
        n_jobs, chsz = self._chunking(size)
        keys = np.unique(np.hstack(
            Parallel(n_jobs)(
                delayed(jsonattrstrf_fit)(X[i:min(i + chsz, size)])
                for i in range(0, size, chsz)
            )
        ))
        self.keys_ = keys
        return self

    def transform(self, X):
        """Return the sparse match/mismatch features for every pair in ``X``."""
        size = X.shape[0]
        n_jobs, chsz = self._chunking(size)
        result = sp.vstack(
            Parallel(n_jobs)(
                delayed(jsonattrstrf_transform)(X[i:min(i + chsz, size)], self.keys_)
                for i in range(0, size, chsz)
            )
        )
        print(result.shape)
        return result

    @staticmethod
    def test():
        """Smoke test executed when the module is loaded."""
        tr = JsonAttrsTrf(n_jobs=1)
        assert (np.round(tr.fit_transform(np.array([
                    [{"something": 1}, {"something": 1}],
                    [{"something": 1}, {"otherthing": 1}],
                    [{"somedict": {"0": "my car", "1": "2016-01-02"}}, {"somedict": {"0": "your car", "1": "2016-01-02"}}],
            ])).toarray(), 2) == np.array([
                    [ 0., 0.,   1., 0., 0., 0. ],
                    [ 0., 0.,   0., 1., 0., 1. ],
                    [ 0., 0.67, 0., 0., 0., 0. ],
            ])).all()
        del tr
JsonAttrsTrf.test()


class IdRecoder(BaseEstimator, TransformerMixin):
    """Replace ids by their frequency rank (0 = most frequent).

    Ids not seen during ``fit`` are mapped to the number of known ids.
    """

    def __init__(self):
        super(IdRecoder, self).__init__()

    def fit(self, X, y=None):
        """Store ``(id, count)`` pairs ordered by descending count in ``uniq_``."""
        ids, counts = np.unique(X.ravel(), return_counts=True)
        self.uniq_ = sorted(zip(ids, counts), key=lambda pair: pair[1], reverse=True)
        return self

    def transform(self, X):
        """Return a float array shaped like ``X`` holding the rank of each id."""
        codes = np.empty(X.shape, dtype=float)
        codes.fill(np.nan)
        for rank, (uid, _count) in enumerate(self.uniq_):
            codes[X == uid] = rank
        # Whatever is still NaN was not matched by any known id.
        codes[~np.isfinite(codes)] = len(self.uniq_)
        return codes

    @staticmethod
    def test():
        """Smoke test executed when the module is loaded."""
        ids = np.array([[10], [11], [20], [10], [11], [11]])
        expected = np.array([[1.], [0.], [2.], [1.], [0.], [0.]])
        tr = IdRecoder()
        assert (tr.fit_transform(ids) == expected).all()
IdRecoder.test()
        



In [None]:

class CatProba(BaseEstimator, TransformerMixin):
    """Encode categories by the positive rate observed for them during ``fit``.

    A row "has" a category when any of its cells equals it.  The output holds
    one column per known category (its positive rate where the row has that
    category, else 0) followed by a column with the row sum.  With
    ``only_sum=True`` only that last column is returned.
    """

    def __init__(self, only_sum=False):
        self.only_sum = only_sum

    def fit(self, X, y=None):
        """Store ``sum(y) / len(y)`` over the rows having each category."""
        self.probabilities_ = {}
        for cat in np.unique(X.ravel()):
            hits = y[(X == cat).any(axis=1)]
            self.probabilities_[cat] = 1.0 * hits.sum() / len(hits)
        return self

    def transform(self, X):
        """Return the CSR matrix of per-category rates plus their row sum."""
        n_cats = len(self.probabilities_)
        result = sp.lil_matrix((X.shape[0], n_cats + 1), dtype=float)
        # Column order follows the iteration order of ``probabilities_``.
        for col, (cat, proba) in enumerate(self.probabilities_.items()):
            rows = np.where((X == cat).any(axis=1))[0]
            result[rows, np.array([col]).repeat(rows.shape[0])] = proba
        result[:, n_cats] = result.sum(axis=1)
        if self.only_sum:
            return result[:, -1].tocsr()
        return result.tocsr()

    @staticmethod
    def test():
        """Smoke test executed when the module is loaded."""
        tr = CatProba()
        assert (tr.fit_transform(np.array([
                        [1, 1],
                        [2, 2],
                        [1, 1],
                        [3, 3],
            ]), np.array([
                        [1],
                        [1],
                        [0],
                        [0],
                    ])).toarray() == np.array([
                    [ 0.5,  0. ,  0. ,  0.5],
                    [ 0. ,  1. ,  0. ,  1. ],
                    [ 0.5,  0. ,  0. ,  0.5],
                    [ 0. ,  0. ,  0. ,  0. ],
                ])).all()
        del tr
CatProba.test()


class MetaCategoryEstimator(BaseEstimator, ClassifierMixin):
    
    catfield = "categoryID"
    pcatfield = "parentCategoryID"
    
    def __init__(self, itrain, itest, category, parent, common):
        super(MetaCategoryEstimator, self).__init__()
        self.itrain = itrain
        self.itest = itest
        self.category = category
        self.parent = parent
        self.common = common
        cats = pd.read_csv("data/Category.csv", index_col=0)
        self.pcat_by_cat_ = {k: v for k, v in cats["parentCategoryID"].iteritems()}
        self.cats_by_pcat_ = {}
        for k, v in self.pcat_by_cat_.iteritems():
            self.cats_by_pcat_[v] = self.cats_by_pcat_.get(v, []) + [k]
        pass
    
    def parent_fit(self, X, y=None):
        from sklearn.cross_validation import train_test_split
        from sklearn.metrics import classification_report
        from sklearn.metrics import roc_auc_score
        
        cats = pd.read_csv("data/ItemInfo_train.csv_%s.csv" % self.__class__.catfield, index_col=0)
        pcats = pd.read_csv("data/ItemInfo_train.csv_%s.csv" % self.__class__.pcatfield, index_col=0)
        self.pcats_ = list(np.unique(pcats[self.__class__.pcatfield].values.ravel()))
        pr = pd.read_csv("data/ItemPairs_train.csv").loc[self.itrain]
        pr = pd.merge(pr, cats, left_on="itemID_1", right_index=True, how="inner", sort=False)
        pr = pd.merge(pr, pcats, left_on="itemID_1", right_index=True, how="inner", sort=False)
        del cats
        del pcats
        
        self.pcat_itrain_ = []
        self.pcat_itest_ = []
        for pcatid in self.pcats_:
            catids = self.cats_by_pcat_[pcatid]
            pcatindices = pd.Index(np.hstack([
                        cat_itrain
                        for catid, cat_itrain in itertools.izip(self.cats_, self.cat_itrain_)
                        if catid in catids
                    ]))
            itrain, itest = train_test_split(pcatindices, test_size=0.05, random_state=42, stratify=pr.loc[pcatindices, "isDuplicate"].values)
            self.pcat_itrain_.append(itrain)
            self.pcat_itest_.append(itest)

        for pcatid, pcatindices, itest in itertools.izip(self.pcats_, self.pcat_itrain_, self.pcat_itest_):
            est = None
            transformers_list = []
            for columns, transformer in self.parent:
                if columns == []:
                    est = clone(transformer)
                    break
                transformers_list.append((columns, transformer))
                x = _fit_transform("train", pcatindices, columns, transformer)
                del x
                _transform("train", pcatindices, itest, columns, transformer)
            if est is not None:
                Xs = [_fit_transform("train", pcatindices, columns, transformer) for columns, transformer in transformers_list]
                if any(sp.issparse(f) for f in Xs):
                    Xs = sp.hstack([_ for _ in Xs if _.shape[-1]]).tocsr()
                else:
                    Xs = np.hstack([_ for _ in Xs if _.shape[-1]])
                print "PCATID:", pcatid, Xs.shape
                est.fit(Xs, pr.loc[pcatindices, "isDuplicate"].values)
                del Xs
                
                Xs = [_transform("train", pcatindices, itest, columns, transformer) for columns, transformer in transformers_list]
                if any(sp.issparse(f) for f in Xs):
                    Xs = sp.hstack([_ for _ in Xs if _.shape[-1]]).tocsr()
                else:
                    Xs = np.hstack([_ for _ in Xs if _.shape[-1]])
                print "SCORE:", roc_auc_score(pr.loc[itest, "isDuplicate"].values, est.predict_proba(Xs)[:,1])
                print classification_report(pr.loc[itest, "isDuplicate"].values, est.predict(Xs))
                del Xs
                dump(est, os.getenv("CACHEDIR", "data/avito-cache/meta-pcat-%d.est" % pcatid))
                del est
    
    def parent_predict_proba(self, mode, catid, indices):
        pdparams = {}
        if mode == "test":
            pdparams["index_col"] = 0
        pcats = pd.read_csv("data/ItemInfo_%s.csv_%s.csv" % (mode, self.__class__.pcatfield), index_col=0)
        pr = pd.read_csv("data/ItemPairs_%s.csv" % mode, **pdparams).loc[indices]
        pr = pd.merge(pr, pcats, left_on="itemID_1", right_index=True, how="inner", sort=False)
        del pcats
        
        result = pd.DataFrame([], index=indices)
        result["proba"] = 0.0
        result["pred"] = 0.0
        
        pcatid = self.pcat_by_cat_[catid]
        pcat_itrain = self.pcat_itrain_[self.pcats_.index(pcatid)]
        
        print indices
        est = None
        transformers_list = []
        for columns, transformer in self.parent:
            if columns == []:
                est = clone(transformer)
                break
            transformers_list.append((columns, transformer))
        if est is not None:
            Xs = [_transform(mode, pcat_itrain, indices, columns, transformer) for columns, transformer in transformers_list]
            if any(sp.issparse(f) for f in Xs):
                Xs = sp.hstack([_ for _ in Xs if _.shape[-1]]).tocsr()
            else:
                Xs = np.hstack([_ for _ in Xs if _.shape[-1]])
            print "PCATID:", pcatid, Xs.shape
            est = load(os.getenv("CACHEDIR", "data/avito-cache/meta-pcat-%d.est" % pcatid))
            result.loc[indices, "proba"] = est.predict_proba(Xs)[:,1]
            result.loc[indices, "pred"] = est.predict(Xs)
            del Xs
            del est

        result["proba0"] = 1.0 - result["proba"]
        print result
        return result[["proba0", "proba"]].values
    
    def fit(self, X, y=None):
        from sklearn.cross_validation import train_test_split
        from sklearn.metrics import classification_report
        from sklearn.metrics import roc_auc_score
        
        assert (X.ravel() == self.itrain.values).all()
        
        cats = pd.read_csv("data/ItemInfo_train.csv_%s.csv" % self.__class__.catfield, index_col=0)
        self.cats_ = list(np.unique(cats[self.__class__.catfield].values.ravel()))
        pr = pd.read_csv("data/ItemPairs_train.csv").loc[self.itrain]
        pr = pd.merge(pr, cats, left_on="itemID_1", right_index=True, how="inner", sort=False)
        del cats
        
        self.cat_itrain_ = []
        self.cat_itest_ = []
        for catid in self.cats_:
            catindices = pr[pr[self.__class__.catfield] == catid].index
            itrain, itest = train_test_split(catindices, test_size=0.05, random_state=42, stratify=pr.loc[catindices, "isDuplicate"].values)
            self.cat_itrain_.append(itrain)
            self.cat_itest_.append(itest)

#         self.common_fit(X, y)
        self.parent_fit(X, y)

        for catid, catindices, itest in itertools.izip(self.cats_, self.cat_itrain_, self.cat_itest_):
            est = None
            transformers_list = []
            for columns, transformer in self.category:
                if columns == []:
                    est = clone(transformer)
                    break
                transformers_list.append((columns, transformer))
                x = _fit_transform("train", catindices, columns, transformer)
                del x
                _transform("train", catindices, itest, columns, transformer)
            if est is not None:
                Xs = [_fit_transform("train", catindices, columns, transformer) for columns, transformer in transformers_list]
                Xs += [self.parent_predict_proba("train", catid, catindices)]
                if any(sp.issparse(f) for f in Xs):
                    Xs = sp.hstack([_ for _ in Xs if _.shape[-1]]).tocsr()
                else:
                    Xs = np.hstack([_ for _ in Xs if _.shape[-1]])
                print "CATID:", catid, Xs.shape
                est.fit(Xs, pr.loc[catindices, "isDuplicate"].values)
                del Xs
                
                Xs = [_transform("train", catindices, itest, columns, transformer) for columns, transformer in transformers_list]
                Xs += [self.parent_predict_proba("train", catid, itest)]
                if any(sp.issparse(f) for f in Xs):
                    Xs = sp.hstack([_ for _ in Xs if _.shape[-1]]).tocsr()
                else:
                    Xs = np.hstack([_ for _ in Xs if _.shape[-1]])
                print "SCORE:", roc_auc_score(pr.loc[itest, "isDuplicate"].values, est.predict_proba(Xs)[:,1])
                print classification_report(pr.loc[itest, "isDuplicate"].values, est.predict(Xs))
                del Xs
                dump(est, os.getenv("CACHEDIR", "data/avito-cache/meta-cat-%d.est" % catid))
                del est
        return self
    
    def predict_proba(self, X):
        cats = pd.read_csv("data/ItemInfo_test.csv_%s.csv" % self.__class__.catfield, index_col=0)
        pr = pd.read_csv("data/ItemPairs_test.csv", index_col=0).loc[self.itest]
        pr = pd.merge(pr, cats, left_on="itemID_1", right_index=True, how="inner", sort=False)
        del cats
        
        result = pd.DataFrame([], index=self.itest)
        result["proba"] = 0.0
        result["pred"] = 0.0
        for catid, cat_itrain in itertools.izip(self.cats_, self.cat_itrain_):
            catindices = pr[pr[self.__class__.catfield] == catid].index
            print catindices
            est = None
            transformers_list = []
            for columns, transformer in self.category:
                if columns == []:
                    est = clone(transformer)
                    break
                transformers_list.append((columns, transformer))
            if est is not None:
                Xs = [_transform("test", cat_itrain, catindices, columns, transformer) for columns, transformer in transformers_list]
                Xs += [self.parent_predict_proba("test", catid, catindices)]
                if any(sp.issparse(f) for f in Xs):
                    Xs = sp.hstack([_ for _ in Xs if _.shape[-1]]).tocsr()
                else:
                    Xs = np.hstack([_ for _ in Xs if _.shape[-1]])
                print "CATID:", catid, Xs.shape
                est = load(os.getenv("CACHEDIR", "data/avito-cache/meta-cat-%d.est" % catid))
                result.loc[catindices, "proba"] = est.predict_proba(Xs)[:,1]
                result.loc[catindices, "pred"] = est.predict(Xs)
                del Xs
                del est
        result["proba0"] = 1.0 - result["proba"]
        print result
        return result[["proba0", "proba"]].values

class TitleDist(BaseEstimator, TransformerMixin):
    """Token-set overlap features for a pair of texts.

    ``analyzer`` is a vectorizer providing ``build_analyzer()``; a default
    ``CountVectorizer`` is created when none is given.
    """

    def __init__(self, analyzer=None):
        super(TitleDist, self).__init__()
        self.analyzer = CountVectorizer() if analyzer is None else analyzer

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, X):
        """Return an ``(n_rows, 2)`` array of overlap ratios.

        Column 0 is ``|A & B| / |A | B|`` and column 1 is
        ``|A ^ B| / |A | B|`` for the token sets of columns 0 and 1 of ``X``;
        both are 0.0 when neither text yields a token.
        """
        tokenize = self.analyzer.build_analyzer()

        def _overlap(left, right):
            left_tokens = set(tokenize(left))
            right_tokens = set(tokenize(right))
            union_size = len(left_tokens | right_tokens)
            if union_size == 0:
                return 0.0, 0.0
            shared = 1.0 * len(left_tokens & right_tokens) / union_size
            exclusive = 1.0 * len(left_tokens ^ right_tokens) / union_size
            return shared, exclusive

        pairwise = np.frompyfunc(_overlap, 2, 2)
        return np.vstack(pairwise(X[:, 0], X[:, 1])).T


def _title_edit_dist_transform(x, y):
    """Return ``[editdistance.eval, dld]`` for two UTF-8 encoded byte strings."""
    left = unicode(x, "UTF-8")
    right = unicode(y, "UTF-8")
    return [editdistance.eval(left, right), dld(left, right)]
class TitleEditDist(BaseEstimator, TransformerMixin):
    """Edit-distance features for pairs of texts, computed in parallel.

    ``n_jobs`` is passed to ``Parallel``.
    """

    def __init__(self, n_jobs=-1):
        super(TitleEditDist, self).__init__()
        self.n_jobs = n_jobs

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, X):
        """Return one ``_title_edit_dist_transform`` result row per pair."""
        jobs = (delayed(_title_edit_dist_transform)(x, y) for x, y in X)
        distances = Parallel(self.n_jobs)(jobs)
        return np.array(distances)

def _token_dist_transform(x, y):
    """Share of tokens of each text that occur as a substring of the other.

    Both arguments are UTF-8 encoded byte strings tokenised with the
    module-level ``analyzer``.  A token counts as a hit when it is a substring
    of the other text's distinct tokens concatenated without a separator.
    Returns hits divided by the total number of distinct tokens of both texts,
    or 0.0 when there are none.
    """
    left = set(analyzer(unicode(x, "UTF-8")))
    right = set(analyzer(unicode(y, "UTF-8")))
    left_blob = ''.join(left)
    right_blob = ''.join(right)
    total = len(left) + len(right)
    if total == 0:
        return 0.0
    hits = sum(1 for token in left if token in right_blob)
    hits += sum(1 for token in right if token in left_blob)
    return 1.0 * hits / total
class TokenDist(BaseEstimator, TransformerMixin):
    """Token containment score for pairs of texts, computed in parallel.

    ``n_jobs`` is passed to ``Parallel``.
    """

    def __init__(self, n_jobs=-1):
        super(TokenDist, self).__init__()
        self.n_jobs = n_jobs

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, X):
        """Return the ``_token_dist_transform`` scores as an ``(n_rows, 1)`` column."""
        jobs = (delayed(_token_dist_transform)(x, y) for x, y in X)
        scores = Parallel(self.n_jobs)(jobs)
        return np.array(scores).reshape(-1, 1)

    

class ConstantCol(BaseEstimator, TransformerMixin):
    """Append a column of ones to the input (dense or sparse)."""

    def __init__(self):
        super(ConstantCol, self).__init__()

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, X):
        """Return ``X`` with an extra trailing column of ones.

        Dense input gives a dense result, sparse input a CSR matrix.  A sparse
        input without columns yields just the (float) column of ones.
        """
        n_rows = X.shape[0]
        if not sp.issparse(X):
            return np.hstack((X, np.ones((n_rows, 1), dtype=int)))
        if X.shape[-1] == 0:
            return sp.csr_matrix(np.ones((n_rows, 1), dtype=float))
        return sp.hstack((X, np.ones((n_rows, 1), dtype=int))).tocsr()

    @staticmethod
    def test():
        """Print the result for dense/sparse and empty/non-empty inputs."""
        tr = ConstantCol()
        filled = np.array([[1, 2], [3, 4], [5, 6]])
        empty = np.array([[], [], []])
        print(tr.fit_transform(filled))
        print(tr.fit_transform(empty))
        print(tr.fit_transform(sp.csr_matrix(filled)).toarray())
        print(tr.fit_transform(sp.csr_matrix(empty)).toarray())
ConstantCol.test()



In [None]:

def _attrsJSON_dist(X):
    """Similarity of the JSON attribute strings in columns 0 and 1 of ``X``.

    Each JSON object is flattened: a non-dict value ``v`` under key ``k``
    becomes the marker key ``k___v`` (value 1), a dict value is kept under
    ``k + "__"``.  Over the union of flattened keys the score is the mean of:

      * 0.0 when the key is missing on one side;
      * 1.0 when at least one side holds a marker;
      * otherwise the mean token Jaccard similarity over the union of
        sub-keys (a sub-key missing on one side counts as 0.0).

    Two objects without keys score 1.0.  A row that cannot be processed is
    reported through ``sys.excepthook`` and yields NaN.

    Returns an ``(n_rows, 1)`` float array.
    """
    from sklearn.feature_extraction.text import CountVectorizer
    a = CountVectorizer().build_analyzer()

    def _flatten(raw):
        return dict(
            ("___".join([k, v]), 1) if not isinstance(v, dict) else (k+"__", v)
            for k, v in json.loads(raw).items()
        )

    def _inner(x, y):
        try:
            x = _flatten(x)
            y = _flatten(y)
            keys = set(x.keys()) | set(y.keys())
            if len(keys) == 0:
                return 1.0
            scores = []
            for k in keys:
                if k not in x or k not in y:
                    scores.append(0.0)
                elif isinstance(x[k], int) or isinstance(y[k], int):
                    scores.append(1.0)
                else:
                    dscore = []
                    for k1 in set(x[k].keys()) | set(y[k].keys()):
                        if k1 not in x[k] or k1 not in y[k]:
                            dscore.append(0.0)
                        else:
                            # jaccard -- compare x against y; comparing x
                            # with itself always scores 1.0.
                            j1 = set(a(x[k][k1]))
                            j2 = set(a(y[k][k1]))
                            if len(j1 | j2) == 0:
                                dscore.append(1.0)
                            else:
                                dscore.append(1.0 * len(j1&j2) / len(j1|j2))
                    scores.append(np.array(dscore).mean())
            return 1.0 * np.array(scores).mean()
        except Exception:
            # Best effort per row: report the problem and mark the row as
            # missing, but let KeyboardInterrupt / SystemExit through.
            import sys
            sys.excepthook(*sys.exc_info())
            return np.nan
    return np.frompyfunc(_inner, 2, 1)(X[:,0], X[:,1]).astype(float).reshape(-1,1)


In [None]:

class IdPairsTransformer(BaseEstimator, TransformerMixin):
    """Pack a pair of numeric ids into one order-independent number.

    A row ``(a, b)`` becomes ``a * 10**digits_ + b`` when ``a >= b`` and
    ``b * 10**digits_ + a`` otherwise, so ``(a, b)`` and ``(b, a)`` map to
    the same code.
    """

    def __init__(self):
        super(IdPairsTransformer, self).__init__()

    def fit(self, X, y=None):
        """Learn ``digits_``, the decimal shift applied to the leading id.

        NaNs are replaced by 0 first; the shift is one more than the largest
        ``ceil(log10(value + 1))`` over all entries of ``X``.
        """
        cleaned = np.nan_to_num(X.ravel())
        widths = np.ceil(np.log10(cleaned + 1))
        self.digits_ = np.max(widths) + 1
        return self

    def transform(self, X):
        """Return an ``(n, 1)`` array holding the packed code of every row."""
        first, second = X[:,0], X[:,1]
        shift = 10**self.digits_
        # Both orderings are materialised in an array of X's dtype, then the
        # one that puts the larger id in front is selected per row.
        packed = np.zeros((X.shape[0], 2), dtype=X.dtype)
        packed[:,0] = first * shift + second
        packed[:,1] = second * shift + first
        chosen = np.where(first >= second, packed[:,0], packed[:,1])
        return chosen.reshape(-1,1)

    @staticmethod
    def test():
        """Self-check: swapped pairs must get the same recoded id."""
        pairs = np.array([
            [634050, 634050],
            [634010, 634050],
            [634050, 634010],
            [634000, 634000],
        ])
        expected = np.array([
            [2.],
            [0.],
            [0.],
            [1.],
        ])
        pipeline = Pipeline([
            ('tr1', IdPairsTransformer()),
            ('tr2', IdRecoder()),
        ])
        assert (pipeline.fit_transform(pairs) == expected).all()


# Run the self-check as soon as the class is defined.
IdPairsTransformer.test()


In [None]:

class PriceTransformer(BaseEstimator, TransformerMixin):
    """Replace prices by an ordinal code of their order of magnitude.

    Every price is bucketed as ``round(log10(price + 1) - 0.2)``; the buckets
    are binarized with a ``LabelBinarizer`` and the position of the active
    column (1-based) becomes the output value.
    """

    def __init__(self):
        super(PriceTransformer, self).__init__()

    @staticmethod
    def _buckets(X):
        """Return the flattened magnitude bucket of every entry of ``X``."""
        return np.nan_to_num(np.round(np.log10(X+1) - 0.2).ravel())

    @staticmethod
    def _ordinal(onehot, shape):
        """Turn indicator rows into 1-based column numbers shaped like ``shape``.

        Rows without any entry equal to 1 keep the value 0.
        """
        codes = np.zeros(onehot.shape[0], dtype=float)
        for col in range(onehot.shape[1]):
            codes[onehot[:,col]==1] = col + 1
        return codes.reshape(shape)

    def fit(self, X, y=None):
        """Fit the label binarizer on the buckets found in ``X``."""
        self.encoder_ = LabelBinarizer()
        self.encoder_.fit(self._buckets(X))
        return self

    def transform(self, X):
        """Return the ordinal bucket codes, with the same shape as ``X``."""
        onehot = self.encoder_.transform(self._buckets(X))
        return self._ordinal(onehot, X.shape)

    def fit_transform(self, X, y=None):
        """Fit a fresh binarizer on ``X`` and return its ordinal codes."""
        self.encoder_ = LabelBinarizer()
        onehot = self.encoder_.fit_transform(self._buckets(X))
        return self._ordinal(onehot, X.shape)
    
class PriceOutlierTransformer(BaseEstimator, TransformerMixin):
    """Count how many of a pair's two prices are outliers within a category.

    ``X`` is read as three columns: column 0 is the category value, columns
    1 and 2 are the two prices.  Prices are compared on a natural-log scale
    (``nan_to_num(log(price))``) against the per-category median and
    standard deviation learned in ``fit``.
    """

    def __init__(self):
        super(PriceOutlierTransformer, self).__init__()

    @staticmethod
    def _log_prices(X):
        """Return ``nan_to_num(log(X[:, [1, 2]]))`` as a new array."""
        return np.nan_to_num(np.log(X[:,[1,2]]))

    def fit(self, X, y=None):
        """Learn the median and std of the log prices of every category."""
        logs = self._log_prices(X)
        self.cats_ = np.unique(X[:,0])
        per_cat = [logs[X[:,0]==cat].ravel() for cat in self.cats_]
        self.median_ = np.array([np.median(values) for values in per_cat])
        self.std_ = np.array([np.std(values) for values in per_cat])
        return self

    def transform(self, X):
        """Return an ``(n, 3)`` float array of outlier counts.

        Column ``m`` holds, for each row, the number of its two log prices
        lying more than ``(m + 1)`` standard deviations away from the median
        of the row's category.  Rows whose category was not seen during
        ``fit`` stay 0.

        The input array is left untouched: the log prices are computed into
        a separate array instead of being written back into ``X`` (writing
        back changed the caller's data, so a second call would take the log
        of a log, and integer inputs were truncated on assignment).
        """
        logs = self._log_prices(X)
        xnew = np.zeros((X.shape[0], 3), dtype=float)
        for cat, median, std in zip(self.cats_, self.median_, self.std_):
            mask = X[:,0] == cat
            cat_logs = logs[mask]
            for m in range(xnew.shape[-1]):
                width = (m + 1) * std
                outside = (cat_logs > median + width) | (cat_logs < median - width)
                xnew[mask,m] += np.sum(outside, axis=1)
        return xnew
    


In [None]:

def CoordAnchor_get_dist(lats, lons, clat, clon):
    """Return, per point, the smaller of two planar distances to a center.

    The first distance uses the point as given, the second uses the point
    with its longitude negated.  The result is one-dimensional.
    """
    lat_term = (lats - clat)**2
    as_given = np.sqrt(lat_term + (lons - clon)**2)
    lon_negated = np.sqrt(lat_term + (-1 * lons - clon)**2)
    both = np.hstack((as_given.reshape(-1,1), lon_negated.reshape(-1,1)))
    return both.min(axis=1)

class CoordAnchor(BaseEstimator, TransformerMixin):
    """Relabel both points of a pair with the id of the nearest anchor.

    ``X`` is read as six columns: ``[id_1, id_2, lat_1, lat_2, lon_1, lon_2]``.
    ``fit`` turns every distinct id into an anchor located at the mean
    coordinates of the points carrying that id; ``transform`` replaces each
    point's id by the id of the closest anchor.
    """

    def __init__(self, n_jobs=-1):
        super(CoordAnchor, self).__init__()
        # Stored for interface compatibility; the computation below is serial.
        self.n_jobs = n_jobs

    @staticmethod
    def _flatten(X):
        """Stack both points of every pair into rows of ``(id, lat, lon)``."""
        return np.hstack((
            X[:,[0,1]].reshape(-1,1),
            X[:,[2,3]].reshape(-1,1),
            X[:,[4,5]].reshape(-1,1),
        ))

    def fit(self, X, y=None):
        """Learn the anchors.

        Sets ``uniqs_`` (ids ordered by decreasing frequency, ties keeping
        ascending id order) and ``centers_`` (mean ``(lat, lon)`` per id, in
        the same order).  Empty input yields empty attributes instead of
        raising.
        """
        xflat = self._flatten(X)
        ids, counts = np.unique(xflat[:,0], return_counts=True)
        # sorted() is stable even with reverse=True, so equally frequent ids
        # keep the ascending order produced by np.unique.
        ranked = sorted(zip(ids, counts), key=lambda pair: pair[1], reverse=True)
        self.uniqs_ = np.array([anchor for anchor, _ in ranked])
        self.centers_ = np.array([
            (xflat[mask,1].mean(), xflat[mask,2].mean())
            for mask in (xflat[:,0] == anchor for anchor in self.uniqs_)
        ]).reshape(-1, 2)
        return self

    def transform(self, X):
        """Return an ``(n, 2)`` array with the nearest anchor id of each point.

        Anchors are visited from most to least frequent and a later anchor
        only wins when it is strictly closer, so ties go to the more frequent
        id.  Points whose distance is NaN keep the initial value 0.
        """
        xflat = self._flatten(X)
        xids = np.zeros(xflat.shape[0])
        xdists = None  # best distance found so far, set by the first anchor
        for anchor, (lat, lon) in zip(self.uniqs_, self.centers_):
            xcurdists = CoordAnchor_get_dist(xflat[:,1], xflat[:,2], lat, lon)
            if xdists is None:
                xdists = xcurdists.copy()
                # Comparing with itself accepts everything except NaN.
                mask = xcurdists <= xdists
            else:
                mask = xcurdists < xdists
            xids[mask] = anchor
            xdists[mask] = xcurdists[mask]
        return xids.reshape(-1, 2)

    @staticmethod
    def test():
        """Self-check on two well separated clusters."""
        tr = CoordAnchor()
        assert (tr.fit_transform(np.array([
                    [1, 1, 29.0, 28.0, 31.0, 32.0],
                    [1, 1, 31.0, 28.0, 28.0, 32.0],
                    [2, 2, 61.0, 57.0, 62.0, 59.0],
                    [2, 2, 59.0, 57.0, 62.0, 61.0],
                    [1, 2, 59.0, 57.0, 62.0, 61.0],
                ])) == np.array([
                    [1, 1],
                    [1, 1],
                    [2, 2],
                    [2, 2],
                    [2, 2],
                ])).all()


# Run the self-check as soon as the class is defined.
CoordAnchor.test()

class MyLabelBinarizer(LabelBinarizer):
    """LabelBinarizer whose ``fit`` accepts and ignores a second argument.

    NOTE(review): presumably needed so the binarizer can be used as a step
    that is called as ``fit(X, y)`` -- confirm against the callers.
    """

    def __init__(self, neg_label=0, pos_label=1, sparse_output=False):
        super(MyLabelBinarizer, self).__init__(
            neg_label=neg_label,
            pos_label=pos_label,
            sparse_output=sparse_output,
        )

    def fit(self, X, y=None):
        """Fit on ``X`` only; ``y`` is not forwarded to the base class."""
        fitted = super(MyLabelBinarizer, self).fit(X)
        return fitted


In [None]:

class FreqTransformer(BaseEstimator, TransformerMixin):
    """Replace every value by the number of times it occurs in the input.

    The transformer is stateless: frequencies are counted over the array
    passed to ``transform`` itself (all columns together).
    """

    def __init__(self):
        super(FreqTransformer, self).__init__()

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, X):
        """Return a float array shaped like ``X`` holding occurrence counts.

        ``np.unique`` hands back, for every element, the position of its
        distinct value (``return_inverse``) together with the count of each
        distinct value, so the counts can be gathered without a Python-level
        dictionary lookup per element.  That lookup raised ``KeyError`` for
        NaN entries because NaN never compares equal to itself.
        """
        _, inverse, counts = np.unique(
            X.ravel(), return_inverse=True, return_counts=True)
        return counts[inverse.ravel()].reshape(X.shape).astype(float)

    @staticmethod
    def test():
        """Self-check on a small integer matrix."""
        tr = FreqTransformer()
        assert (tr.fit_transform(np.array([
                    [1,3],
                    [5,2],
                    [1,5],
                    [3,5],
                ])) == np.array([
                    [2,2],
                    [3,1],
                    [2,3],
                    [2,3],
                ])).all()
# FreqTransformer.test()


In [None]:

def _pairs_dist_transform(X, metrics):
    """Compute distances between the two halves of every row of ``X``.

    Each row is read as two vectors of equal length laid side by side
    (with an odd number of columns the last column is ignored).

    Parameters
    ----------
    X : two-dimensional array, one pair per row.
    metrics : iterable of metric names/callables accepted by
        ``sklearn.metrics.pairwise.pairwise_distances``.

    Returns
    -------
    Array with one row per input row and one column per metric.
    """
    from sklearn.metrics.pairwise import pairwise_distances
    # Floor division keeps the split point an integer even when "/" means
    # true division (Python 3 or `from __future__ import division`).
    half = X.shape[-1] // 2
    return np.array([
        [
            pairwise_distances(
                X[i, :half].reshape(1,-1),
                X[i, half:half*2].reshape(1,-1),
                metric=metric,
            ).ravel()[0]
            for metric in metrics
        ]
        for i in range(X.shape[0])
    ])
    
class PairsDist(BaseEstimator, TransformerMixin):
    """Distances between the two halves of every row, computed in parallel.

    Parameters
    ----------
    metrics : list of metrics forwarded to ``_pairs_dist_transform``; the
        output has one column per metric.
    n_jobs : number of workers.  Negative values follow the joblib
        convention (``-1`` = all CPUs, ``-2`` = all but one, ...).
    """

    def __init__(self, metrics, n_jobs=-1):
        super(PairsDist, self).__init__()
        self.metrics = metrics
        self.n_jobs = n_jobs

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, X):
        """Return an ``(n_rows, n_metrics)`` array of distances.

        Rows are split into at most ``n_jobs`` contiguous chunks that are
        processed by separate workers and stacked back in order.
        """
        size = X.shape[0]
        if size == 0:
            # Without chunks np.vstack would be called on an empty list and
            # raise; return an empty result of the right width instead.
            return np.zeros((0, len(self.metrics)))

        n_jobs = self.n_jobs
        if n_jobs < 0:
            # -1 -> every CPU, -2 -> every CPU but one, and so on.
            n_jobs = max(cpu_count() + 1 + n_jobs, 1)

        if size <= n_jobs:
            # Fewer rows than workers: one row per worker.
            n_jobs = size
            chsz = 1
        else:
            # Ceiling division so that n_jobs chunks cover every row.
            chsz, extra = divmod(size, n_jobs)
            if extra:
                chsz += 1

        return np.vstack(
            Parallel(n_jobs)(
                delayed(_pairs_dist_transform)(X[start:min(start+chsz, size)], self.metrics)
                for start in range(0, size, chsz)
            )
        )


In [None]:

def get_train_indices(column="categoryID"):
    """Pick training pairs whose `column` distribution mirrors the test set.

    Reads ``data/ItemPairs_train.csv`` / ``data/ItemPairs_test.csv`` and the
    per-column item files ``data/ItemInfo_<mode>.csv_<column>.csv``; the
    column value of a pair is taken from its first item (``itemID_1``).

    Parameters
    ----------
    column : name of the item column to stratify on, or ``"all"`` to return
        the index of every training pair unchanged.

    Returns
    -------
    pandas.Index with the selected training row labels, sorted ascending.
    """
    # Only the modules that are actually used are imported here; the
    # previous unused imports (xgboost among them) made this helper fail on
    # machines where those packages are missing.
    import numpy as np
    import pandas as pd
    from sklearn.utils import shuffle

    prtrain = pd.read_csv("data/ItemPairs_train.csv")
    if column == "all":
        return prtrain.index

    prtest = pd.read_csv("data/ItemPairs_test.csv", index_col=0)
    prtest = pd.merge(prtest,
                     pd.read_csv("data/ItemInfo_test.csv_%s.csv" % column, index_col=0),
                     left_on="itemID_1", right_index=True, how="inner", sort=False)
    # Share of every category among the test pairs.
    catdist = prtest[column].value_counts() / len(prtest)
    del prtest

    prtrain = pd.merge(prtrain,
                       pd.read_csv("data/ItemInfo_train.csv_%s.csv" % column, index_col=0),
                       left_on="itemID_1", right_index=True, how="inner", sort=False)

    shares = list(zip(catdist.index, catdist.values))
    # Training row labels of every test category, filtered once and reused.
    rows_by_cat = dict(
        (cat, prtrain[prtrain[column] == cat].index) for cat, _ in shares
    )

    # Shrink the target size until every category can supply its share.
    trainsize = len(prtrain)
    for cat, dist in shares:
        trcatdist = len(rows_by_cat[cat])
        if trcatdist < int(1.0 * dist * trainsize):
            trainsize = int(1.0 * trainsize * trcatdist / (dist * trainsize))

    # Fixed seed keeps the selection reproducible between runs.
    parts = [
        np.asarray(shuffle(rows_by_cat[cat], random_state=1)[:int(dist*trainsize)])
        for cat, dist in shares
    ]
    indices = np.hstack(parts) if parts else np.array([])

    indices = pd.Index(np.sort(indices.astype(int)))
    return indices


In [None]:

def _get_X(mode, indices, columns):
    """Assemble the raw feature matrix for the requested pair rows.

    Starts from ``data/ItemPairs_<mode>.csv`` and appends, for every entry of
    `columns`:

    * ``"pair_<name>"`` -- column ``<name>`` of
      ``data/ItemPairs_<mode>.csv_<name>.csv``, joined on the pair index;
    * ``"itemID"`` -- copies of ``itemID_1`` and ``itemID_2``;
    * any other name -- column ``<name>`` of
      ``data/ItemInfo_<mode>.csv_<name>.csv``, joined once on ``itemID_1``
      and once on ``itemID_2`` (missing values are filled per column type).

    Parameters
    ----------
    mode : ``"train"`` or ``"test"``; the test file is read with its first
        column as index.
    indices : the string ``"all"`` or row labels passed to ``.loc``.
    columns : list of column names as described above.

    Returns
    -------
    ndarray holding only the appended columns, in the order they were added.
    """
    params = {}
    if mode == "test":
        params["index_col"] = 0
    pr = pd.read_csv("data/ItemPairs_%s.csv" % mode, **params)
    # Compare with the "all" sentinel only when `indices` is a string: on an
    # Index/ndarray the comparison is element-wise and its truth value is
    # ambiguous.
    take_all = isinstance(indices, (type(""), type(u""))) and indices == "all"
    if not take_all:
        pr = pr.loc[indices]

    sourcecols = len(pr.columns)
    for colname in columns:
        if colname.startswith("pair_"):
            colname = colname[5:]
            colfn = ("data/ItemPairs_%s.csv" % mode) + "_" + colname + ".csv"
            column = pd.read_csv(colfn, index_col=0)
            pr = pr.merge(column[[colname]], how="left", left_index=True, right_index=True, sort=False)
            del column
        elif colname == 'itemID':
            pr["itemID_x"] = pr["itemID_1"]
            pr["itemID_y"] = pr["itemID_2"]
        else:
            colfn = ("data/ItemInfo_%s.csv" % mode) + "_" + colname + ".csv"
            column = pd.read_csv(colfn, index_col=0)
            # Fill missing values with a neutral default for the column type.
            # Price columns are left as read: their preprocessing hooks were
            # hard-disabled (`if False`) and are therefore not reproduced.
            if colname in ("title", "description", "stoptitle", "stopdescription"):
                column = column.fillna("nan")
                column[column==""] = "nan"
            elif colname in ("images_array",):
                column = column.fillna("")
            elif colname in ("attrsJSON",):
                column = column.fillna("{}")
                column[column==""] = "{}"
            elif colname in ("categoryID", "parentCategoryID"):
                column = column.fillna(0)
            elif colname in ("locationID", "regionID", "metroID"):
                column = column.fillna(1)
            # One copy of the column per item of the pair.
            pr = pr.merge(column[[colname]], how="left", left_on="itemID_1", right_index=True, sort=False)
            pr = pr.merge(column[[colname]], how="left", left_on="itemID_2", right_index=True, sort=False)
            del column
    return pr[pr.columns[sourcecols:]].values

@memory.cache
def _get_transformer(mode, indices, columns, transformer):
    """Fit a clone of `transformer` on the selected pairs (result is cached).

    Parameters
    ----------
    mode : ``"train"`` or ``"test"``; selects ``data/ItemPairs_<mode>.csv``,
        whose ``isDuplicate`` column supplies the targets.
    indices : the string ``"all"`` or row labels passed to ``.loc``; the
        same value is handed to ``_get_X``.
    columns : column names forwarded to ``_get_X``.
    transformer : estimator to clone and fit.

    Returns
    -------
    ``(fitted_clone, fit_transform_output)``
    """
    params = {}
    if mode == "test":
        params["index_col"] = 0
    pairs = pd.read_csv("data/ItemPairs_%s.csv" % mode, **params)
    # Honour the "all" sentinel the same way _get_X does; `.loc["all"]`
    # would look for a row labelled "all" and fail.
    take_all = isinstance(indices, (type(""), type(u""))) and indices == "all"
    if not take_all:
        pairs = pairs.loc[indices]
    tr = clone(transformer)
    output = tr.fit_transform(
        _get_X(mode, indices, columns),
        pairs["isDuplicate"].values)
    return tr, output


@memory.cache
def _fit_transform(mode, indices, columns, transformer):
    """Return the fit_transform output of `transformer` on the rows `indices`.

    `mode` is not read in the body: the fit is always requested from
    ``_get_transformer`` with ``"train"``.
    """
    fitted, features = _get_transformer("train", indices, columns, transformer)
    return features


@memory.cache
def _transform(mode, train, test, columns, transformer):
    """Apply a transformer fitted on the `train` rows to the `test` rows.

    The fitted transformer comes from ``_get_transformer("train", ...)``;
    the rows to transform are read with ``_get_X(mode, test, columns)``.
    """
    fitted, _unused = _get_transformer("train", train, columns, transformer)
    features = _get_X(mode, test, columns)
    return fitted.transform(features)

def get_xgb_feat_importances(clf):
    """Return the relative feature importances of an xgboost model.

    Parameters
    ----------
    clf : either a scikit-learn style wrapper (``xgb.XGBModel`` subclass) or
        a raw ``xgb.Booster`` as returned by ``xgb.train``.

    Returns
    -------
    pandas.DataFrame with columns ``Feature`` and ``Importance``, sorted by
    decreasing importance; the importances are f-scores divided by their sum.
    The head of the table is printed twice: once sorted by feature name and
    once sorted by importance.
    """
    import xgboost as xgb

    if isinstance(clf, xgb.XGBModel):
        # clf has been created by calling
        # xgb.XGBClassifier.fit() or xgb.XGBRegressor().fit().
        # Prefer get_booster() when the wrapper offers it and fall back to
        # the booster() accessor otherwise.
        if hasattr(clf, "get_booster"):
            fscore = clf.get_booster().get_fscore()
        else:
            fscore = clf.booster().get_fscore()
    else:
        # clf has been created by calling xgb.train.
        # Thus, clf is an instance of xgb.Booster.
        fscore = clf.get_fscore()

    # Naming the columns keeps the frame usable when fscore is empty.
    feat_importances = pd.DataFrame(
        [{'Feature': ft, 'Importance': score} for ft, score in fscore.items()],
        columns=['Feature', 'Importance'])
    feat_importances = feat_importances.sort_values(
        by='Feature', ascending=True).reset_index(drop=True)
    print(feat_importances.head())
    feat_importances = feat_importances.sort_values(
        by='Importance', ascending=False).reset_index(drop=True)
    # Divide the importances by the sum of all importances
    # to get relative importances, so that they add up to 1.
    feat_importances['Importance'] /= feat_importances['Importance'].sum()
    # Print the most important features and their importances
    print(feat_importances.head())
    return feat_importances

def _get_first_col(X):
    """Return column 0 of ``X``, kept two-dimensional (one column wide)."""
    wanted = [0]
    return X[:, wanted]

def _binary_jaccard(X):
    """Divide the two half-sums of the per-row total by that per-row total.

    The per-row sum ``sm`` is split in two along its last axis, each part is
    summed per row, the two results are stacked side by side and the stack
    is divided by ``sm``.
    """
    # NOTE(review): `sm[:, ...]` needs `X.sum(axis=1)` to be two-dimensional
    # (e.g. a matrix result); a plain 2-D ndarray gives a 1-D sum that cannot
    # be indexed with two subscripts -- confirm the expected input type.
    # NOTE(review): the `/2` slice bounds must be integers, i.e. this relies
    # on Python 2 integer division.
    # NOTE(review): splitting the already reduced `sm` instead of `X` looks
    # unintended -- verify the intent before relying on the result.
    sm = X.sum(axis=1)
    return np.hstack((
            sm[:,:(sm.shape[-1]/2)].sum(axis=1),
            sm[:,(sm.shape[-1]/2):].sum(axis=1),
        )).astype(float) / sm

def _price_diff(X):
    """Return the relative gap between the two prices of every row.

    The absolute difference of columns 0 and 1 is divided by the row maximum
    plus one; NaNs in either term are replaced by 0 beforehand.  The result
    is a single column.
    """
    gap = np.nan_to_num(np.abs(X[:,0] - X[:,1]))
    larger = np.nan_to_num(np.max(X[:,[0,1]], axis=1))
    ratio = gap / (larger + 1.0)
    return ratio.reshape(-1,1)

def _text_count(X):
    """Return, shaped like ``X``, the number of tokens of every text entry.

    Tokens are produced by the analyzer of a default ``CountVectorizer``.
    """
    analyzer = CountVectorizer().build_analyzer()

    def _n_tokens(text):
        return len(analyzer(text))

    counts = np.frompyfunc(_n_tokens, 1, 1)(X.ravel())
    return counts.reshape(X.shape).astype(float)

def _text_concat(X):
    """Join column 0 with column 2 and column 1 with column 3.

    The joined texts are separated by a single space and returned as a
    two-column array of the same dtype as ``X``.
    """
    joined = np.zeros((X.shape[0], 2), dtype=X.dtype)
    for target, (head, tail) in enumerate(((0, 2), (1, 3))):
        joined[:, target] = X[:, head] + ' ' + X[:, tail]
    return joined

def _print_return(X):
    """Print ``X`` and hand it back unchanged (debugging pass-through)."""
    # Parenthesised form: same output as the print statement on Python 2 and
    # also valid on Python 3.
    print(X)
    return X

def _get_features(**kwargs):
    from xgboost import XGBClassifier

    if kwargs.get("pcats", None) is None:
        pcats = [1, 2, 4, 5, 6, 7, 8, 35, 110, 113]
    else:
        pcats = kwargs["pcats"]
    
    if kwargs.get("cats", None) is None:
        cats = [9, 10, 11, 14, 19, 20, 21, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 36, 38, 39, 40, 42, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 96, 97, 98, 99, 101, 102, 105, 106, 111, 112, 114, 115, 116]
    else:
        cats = kwargs["cats"]
    
    if kwargs.get("linear", None) is None:
        labelbin = []
    else:
        labelbin = [("labelvectorizer", MyLabelBinarizer(sparse_output=True))]
    
    return [
    ] + [
        (["parentCategoryID"], Pipeline([
                    ('selector', Selector(include=[0])),
                ] + labelbin)),
        (["categoryID"], Pipeline([
                    ('selector', Selector(include=[0])),
                ] + labelbin)),
    ] + [
        (["attrsJSON"], Pipeline([
                    ('jaccard', FunctionTransformer(_attrsJSON_dist, validate=False)),
                ])),
    ] + [
#         (["categoryID", "attrsJSON"], FilteredSupervisedDecision(est=Pipeline([
#                         ('feats', FeatureUnion(transformer_list=[
#                                     ('attrs', Pipeline([
#                                                 ('tr1', JsonToDict(1)),
#                                                 ('tr2', JsonAttrsTrf(1)),
#                                                 ('imputer', ConstantCol()),
#                                             ])),
#                                 ])),
#                         ('est', LogisticRegression(C=0.01)),
#                         ]), switch=0, val=categoryID, xcols=[2,3]))
#         for categoryID in cats
    ] + [
        (["parentCategoryID", "attrsJSON"], FilteredSupervisedDecision(est=Pipeline([
                        ('feats', FeatureUnion(transformer_list=[
                                    ('attrs', Pipeline([
                                                ('tr1', JsonToDict(1)),
                                                ('tr2', JsonAttrsTrf(1)),
                                                ('imputer', ConstantCol()),
                                            ])),
                                ])),
                        ('est', LogisticRegression(C=0.01)),
                        ]), switch=0, val=parentCategoryID, xcols=[2,3]))
        for parentCategoryID in pcats
    ] + [
#         (["parentCategoryID", "stoptitle"], FilteredSupervisedDecision(est=Pipeline([
#                         ('feats', FeatureUnion(transformer_list=[
#                                     ('title', Pipeline([
#                                                 ('transform', SwitchTransformer(est=CountVectorizer(min_df=10, binary=True), preaction="ravel", postaction=["xor"])),
#                                             ])),
#                                 ])),
#                         ('est', SGDClassifier(loss="modified_huber", alpha=0.001)),
#                         ]), switch=0, val=parentCategoryID, xcols=[2,3]))
#         for parentCategoryID in pcats
    ] + [
        (["parentCategoryID", "stoptitle"], FilteredSupervisedDecision(est=Pipeline([
                        ('feats', FeatureUnion(transformer_list=[
                                    ('title', Pipeline([
                                                ("transform", SwitchTransformer(est=Pipeline([
                                                                ('v', TfidfVectorizer(min_df=10, binary=True)),
                                                                ('svd', TruncatedSVD(100)),
                                                            ]), preaction="ravel")),
                                                ("pairdist", PairsDist(metrics=["cosine", "euclidean", "cityblock"])),
                                            ])),
                                ])),
                        ]), switch=0, val=parentCategoryID, xcols=[2,3]))
        for parentCategoryID in pcats
    ] + [
#         (["parentCategoryID", "stoptitle"], FilteredSupervisedDecision(est=Pipeline([
#                         ('feats', FeatureUnion(transformer_list=[
#                                     ('title', Pipeline([
#                                                 ("transform", SwitchTransformer(est=Pipeline([
#                                                                 ('v', CountVectorizer(min_df=10, binary=True)),
#                                                                 ('cl', SupervisedDecision(MiniBatchKMeans(500), method="predict")),
#                                                             ]), preaction="ravel")),
#                                                 ("freq", IdPairsTransformer()),
#                                                 ("binrz", IdRecoder()),
#                                             ])),
#                                 ])),
#                         ]), switch=0, val=parentCategoryID, xcols=[2,3]))
#         for parentCategoryID in pcats
    ] + [
        (["parentCategoryID", "stoptitle"], FilteredSupervisedDecision(est=Pipeline([
                        ('feats', FeatureUnion(transformer_list=[
                                    ('title', Pipeline([
                                                ("transform", SwitchTransformer(est=Pipeline([
                                                                ('v', CountVectorizer(min_df=10, binary=True)),
                                                                ('svd', TruncatedSVD(100)),
                                                                ('cl', SupervisedDecision(MiniBatchKMeans(500), method="predict")),
                                                            ]), preaction="ravel")),
                                                ("freq", IdPairsTransformer()),
                                                ("binrz", IdRecoder()),
                                            ])),
                                ])),
                        ]), switch=0, val=parentCategoryID, xcols=[2,3]))
        for parentCategoryID in pcats
    ] + [
        (["parentCategoryID", "stoptitle"], FilteredSupervisedDecision(est=Pipeline([
                        ('feats', FeatureUnion(transformer_list=[
                                    ('title', Pipeline([
                                                ('transform', SwitchTransformer(est=CountVectorizer(analyzer="char_wb", min_df=10, ngram_range=(1,5), binary=True), preaction="ravel", postaction=["and", "xor"])),
                                            ])),
                                ])),
                        ('est', LogisticRegression(C=0.01)),
                        ]), switch=0, val=parentCategoryID, xcols=[2,3]))
        for parentCategoryID in pcats
    ] + [
#         (["categoryID", "stoptitle"], FilteredSupervisedDecision(est=Pipeline([
#                         ('feats', FeatureUnion(transformer_list=[
#                                     ('title', Pipeline([
#                                                 ('transform', SwitchTransformer(est=CountVectorizer(min_df=10, binary=True), preaction="ravel", postaction=["and", "xor"])),
#                                             ])),
#                                 ])),
#                         ('est', LogisticRegression(C=0.01)),
#                         ]), switch=0, val=categoryID, xcols=[2,3]))
#         for categoryID in cats
    ] + [
#         (["parentCategoryID", "stopdescription"], FilteredSupervisedDecision(est=Pipeline([
#                         ('feats', FeatureUnion(transformer_list=[
#                                     ('title', Pipeline([
#                                                 ('transform', SwitchTransformer(est=CountVectorizer(min_df=10, binary=True), preaction="ravel", postaction=["xor"])),
#                                             ])),
#                                 ])),
#                         ('est', LogisticRegression(C=0.01)),
#                         ]), switch=0, val=parentCategoryID, xcols=[2,3]))
#         for parentCategoryID in pcats
    ] + [
        (["parentCategoryID", "description"], FilteredSupervisedDecision(est=Pipeline([
                        ('feats', FeatureUnion(transformer_list=[
                                    ('desc', Pipeline([
                                                ('transform', SwitchTransformer(est=CountVectorizer(binary=True, min_df=10), preaction="ravel", postaction=["and", "xor"])),
                                            ])),
                                ])),
                        ('est', LogisticRegression(C=0.01)),
                        ]), switch=0, val=parentCategoryID, xcols=[2,3]))
        for parentCategoryID in pcats
    ] + [
#         (["categoryID", "description"], FilteredSupervisedDecision(est=Pipeline([
#                         ('feats', FeatureUnion(transformer_list=[
#                                     ('desc', Pipeline([
#                                                 ('transform', SwitchTransformer(est=CountVectorizer(binary=True, min_df=10), preaction="ravel", postaction=["and", "xor"])),
#                                             ])),
#                                 ])),
#                         ('est', LogisticRegression(C=0.01)),
#                         ]), switch=0, val=categoryID, xcols=[2,3]))
#         for categoryID in cats
    ] + [
#         (["title"], Pipeline([
#                     ("titledist", TitleDist()),
#                 ])),
#         (["stoptitle"], Pipeline([
#                     ("titledist", TitleDist()),
#                 ])),
        (["stoptitle"], Pipeline([
                    ("titledist", TitleDist(analyzer=CountVectorizer(min_df=10))),
                ])),
        (["stoptitle"], Pipeline([
                    ("titledist", TitleEditDist(n_jobs=2)),
                ])),
        (["stoptitle"], Pipeline([
                    ("titledist", TokenDist(n_jobs=2)),
                ])),
#         (["stoptitle"], Pipeline([
#                     ('title', Pipeline([
#                                 ("transform", SwitchTransformer(est=Pipeline([
#                                                 ('v', CountVectorizer(min_df=10, binary=True)),
#                                                 ('cl', SupervisedDecision(MiniBatchKMeans(3000), method="predict")),
#                                             ]), preaction="ravel")),
#                                 ("freq", IdPairsTransformer()),
#                                 ("binrz", IdRecoder()),
#                             ])),
#                 ])),
#         (["title"], Pipeline([
#                     ("titledist", TitleDist(analyzer=CountVectorizer(analyzer="char_wb", min_df=10, ngram_range=(1,4)))),
#                 ])),
        (["title"], Pipeline([
                    ("titledist", TitleDist(analyzer=CountVectorizer(analyzer="char_wb", min_df=10, ngram_range=(1,5)))),
                ])),
    ] + [
#         (["description"], Pipeline([
#                     ("titledist", TitleDist()),
#                 ])),
#         (["stopdescription"], Pipeline([
#                     ("titledist", TitleDist()),
#                 ])),
        (["stopdescription"], Pipeline([
                    ("titledist", TitleDist(analyzer=CountVectorizer(min_df=10))),
                ])),
        (["stopdescription"], Pipeline([
                    ("titledist", TitleEditDist(n_jobs=3)),
                ])),
        (["stopdescription"], Pipeline([
                    ("titledist", TokenDist(n_jobs=3)),
                ])),
#         (["description"], Pipeline([
#                     ("titledist", TitleDist(analyzer=CountVectorizer(analyzer="char_wb", min_df=10, ngram_range=(1,4)))),
#                 ])),
#         (["description"], Pipeline([
#                     ("titledist", TitleDist(analyzer=CountVectorizer(analyzer="char_wb", min_df=10, ngram_range=(1,5)))),
#                 ])),
    ] + [
#         (["title", "description"], Pipeline([
#                     ("concat", FunctionTransformer(_text_concat, validate=False)),
#                     ("titledist", TitleDist()),
#                 ])),
        (["stoptitle", "stopdescription"], Pipeline([
                    ("concat", FunctionTransformer(_text_concat, validate=False)),
                    ("titledist", TitleDist()),
                ])),
        (["stoptitle", "stopdescription"], Pipeline([
                    ("concat", FunctionTransformer(_text_concat, validate=False)),
                    ("editdist", TitleEditDist()),
                ])),
    ] + [
#         (["title"], Pipeline([
#                     ("counter", FunctionTransformer(_text_count, validate=False)),
#                     ("freq", IdPairsTransformer()),
#                     ("binrz", IdRecoder()),
#                 ] + labelbin)),
        (["stoptitle"], Pipeline([
                    ("counter", FunctionTransformer(_text_count, validate=False)),
                    ("freq", IdPairsTransformer()),
                    ("binrz", IdRecoder()),
                ] + labelbin)),
#         (["description"], Pipeline([
#                     ("counter", FunctionTransformer(_text_count, validate=False)),
#                     ("freq", IdPairsTransformer()),
#                     ("binrz", IdRecoder()),
#                 ] + labelbin)),
        (["stopdescription"], Pipeline([
                    ("counter", FunctionTransformer(_text_count, validate=False)),
                    ("freq", IdPairsTransformer()),
                    ("binrz", IdRecoder()),
                ] + labelbin)),
        (["images_array"], Pipeline([
                    ("counter", FunctionTransformer(_text_count, validate=False)),
                    ("freq", IdPairsTransformer()),
                    ("binrz", IdRecoder()),
                ] + labelbin)),
    ] + [
        (["itemID"], Pipeline([
                    ('freq', FreqTransformer()),
                    ('pairs', IdPairsTransformer()),
                    ('binrz', IdRecoder()),
                ])),
        (["locationID"], SwitchTransformer(est=Pipeline([
                        ('recored', IdRecoder()),
                    ] + labelbin))),
        (["locationID", "lat", "lon"], LocationDist(list(np.logspace(-3.5, 6, 20, base=2)))),
        (["regionID"], SwitchTransformer(est=Pipeline([
                        ('recored', IdRecoder()),
                    ] + labelbin))),
        (["regionID", "lat", "lon"], LocationDist(list(np.logspace(-2, 7, 20, base=2)))),
        (["metroID"], SwitchTransformer(est=Pipeline([
                        ('recored', IdRecoder()),
                    ] + labelbin))),
        (["metroID", "lat", "lon"], LocationDist(list(np.logspace(-4, 4, 5, base=2)))),
        (["lat", "lon"], CoordDist()),
        (["lat", "lon"], VarianceThreshold()),
        (["locationID"], Pipeline([
                    ('freq', IdPairsTransformer()),
                    ('binrz', IdRecoder()),
                ] + labelbin)),
        (["locationID", "lat", "lon"], Pipeline([
                    ('predict', CoordAnchor()),
                    ('combine', IdPairsTransformer()),
                    ('binrz', IdRecoder()),
                ] + labelbin)),
#         (["locationID"], Pipeline([
#                     ('freq', IdFreq(1000)),
#                     ('binrz', EqNonEqBinarizer()),
#                 ])),
#         (["locationID"], SupervisedDecision(est=Pipeline([
#                         ('binrz', EqNonEqBinarizer()),
#                         ('est', LogisticRegression(C=1.0)),
#                 ]))),
        (["regionID", "lat", "lon"], Pipeline([
                    ('predict', CoordAnchor()),
                    ('freq', IdPairsTransformer()),
                    ('binrz', IdRecoder()),
                ] + labelbin)),
        (["regionID"], Pipeline([
                    ('freq', IdPairsTransformer()),
                    ('binrz', IdRecoder()),
                ] + labelbin)),
#         (["regionID"], Pipeline([
#                     ('binrz', EqNonEqBinarizer()),
#                 ])),
        (["metroID", "lat", "lon"], Pipeline([
                    ('predict', CoordAnchor()),
                    ('freq', IdPairsTransformer()),
                    ('binrz', IdRecoder()),
                ] + labelbin)),
#         (["metroID"], SupervisedDecision(est=Pipeline([
#                         ('binrz', EqNonEqBinarizer()),
#                         ('est', LogisticRegression(C=1.0)),
#                 ]))),
        (["metroID"], Pipeline([
                    ('freq', IdPairsTransformer()),
                    ('binrz', IdRecoder()),
                ] + labelbin)),
#         (["metroID"], Pipeline([
#                     ('binrz', EqNonEqBinarizer()),
#                 ])),
        (["price"], Pipeline([
                    ('trf', FunctionTransformer(_price_diff, validate=False)),
                ])),
#         (["price"], Pipeline([
#                     ('binrz', EqNonEqBinarizer()),
#                 ])),
        (["price"], Pipeline([
                    ('trf', PriceTransformer()),
                    ('pairs', IdPairsTransformer()),
                    ('recoder', IdRecoder()),
                    ('noop', VarianceThreshold()),
                ] + labelbin)),
        (["categoryID", "price"], Pipeline([
                    ('selector', Selector(include=[1,2,3])),
                    ('trf', PriceOutlierTransformer()),
                ])),
        (["parentCategoryID", "price"], Pipeline([
                    ('selector', Selector(include=[1,2,3])),
                    ('trf', PriceOutlierTransformer()),
                ])),
        (["pair_imagesdist_Exact"], VarianceThreshold()),
        (["pair_imagesdist_Correlation_max"], Imputer()),
        (["pair_imagesdist_Chi-Squared_min"], Imputer()),
        (["pair_imagesdist_Intersection_max"], Imputer()),
        (["pair_imagesdist_Hellinger_min"], Imputer()),

        (["pair_imagesdist_Correlation_mean"], Imputer()),
        (["pair_imagesdist_Chi-Squared_mean"], Imputer()),
        (["pair_imagesdist_Intersection_mean"], Imputer()),
        (["pair_imagesdist_Hellinger_mean"], Imputer()),

        (["pair_imagesdist_scipy.braycurtis_min"], Imputer()),
        (["pair_imagesdist_scipy.canberra_min"], Imputer()),
        (["pair_imagesdist_scipy.chebyshev_min"], Imputer()),
        (["pair_imagesdist_scipy.cityblock_min"], Imputer()),
        (["pair_imagesdist_scipy.cosine_min"], Imputer()),
        (["pair_imagesdist_scipy.euclidean_min"], Imputer()),
        (["pair_imagesdist_scipy.hamming_min"], Imputer()),
        (["pair_imagesdist_scipy.sqeuclidean_min"], Imputer()),
        (["pair_imagesdist_skimage.compare_mse_min"], Imputer()),
        (["pair_imagesdist_skimage.compare_ssim_3_max"], Imputer()),
        (["pair_imagesdist_skimage.compare_ssim_5_max"], Imputer()),
        (["pair_imagesdist_skimage.compare_ssim_7_max"], Imputer()),
        (["pair_imagesdist_skimage.compare_mse.adapt_min"], Imputer()),
        (["pair_imagesdist_skimage.compare_ssim_3.adapt_max"], Imputer()),
        (["pair_imagesdist_skimage.compare_ssim_5.adapt_max"], Imputer()),
        (["pair_imagesdist_skimage.compare_ssim_7.adapt_max"], Imputer()),

#         (["pair_imagesdist_scipy.braycurtis_mean"], Imputer()),
#         (["pair_imagesdist_scipy.canberra_mean"], Imputer()),
#         (["pair_imagesdist_scipy.chebyshev_mean"], Imputer()),
#         (["pair_imagesdist_scipy.cityblock_mean"], Imputer()),
#         (["pair_imagesdist_scipy.cosine_mean"], Imputer()),
#         (["pair_imagesdist_scipy.euclidean_mean"], Imputer()),
#         (["pair_imagesdist_scipy.hamming_mean"], Imputer()),
#         (["pair_imagesdist_scipy.sqeuclidean_mean"], Imputer()),
#         (["pair_imagesdist_skimage.compare_mse_mean"], Imputer()),
#         (["pair_imagesdist_skimage.compare_ssim_3_mean"], Imputer()),
#         (["pair_imagesdist_skimage.compare_ssim_5_mean"], Imputer()),
#         (["pair_imagesdist_skimage.compare_ssim_7_mean"], Imputer()),
#         (["pair_imagesdist_skimage.compare_mse.adapt_mean"], Imputer()),
#         (["pair_imagesdist_skimage.compare_ssim_3.adapt_mean"], Imputer()),
#         (["pair_imagesdist_skimage.compare_ssim_5.adapt_mean"], Imputer()),
#         (["pair_imagesdist_skimage.compare_ssim_7.adapt_mean"], Imputer()),
    ] + [
        (["pair_imagesdist_Correlation_%d" % _], VarianceThreshold())
        for _ in range(99, 97, -1)
#         for _ in [95, 90, 85, 75, 70, 65, 60]
#         for _ in [95, 90, 85]
    ] + [
        (["pair_imagesdist_Chi-Squared_%d" % _], VarianceThreshold())
        for _ in range(1,3)
#         for _ in [10, 20, 50, 100, 150, 200]
    ] + [
        (["pair_imagesdist_Intersection_%d" % _], VarianceThreshold())
        for _ in range(250,230,-10)
    ] + [
        (["pair_imagesdist_Hellinger_%d" % _], VarianceThreshold())
        for _ in range(1,3)
#         for _ in [10, 20]
#     ] + [
#         (["parentCategoryID", "attrsJSON"], FilteredSupervisedDecision(est=Pipeline([
#                         ('feats', FeatureUnion(transformer_list=[
#                                     ('attrs', Pipeline([
#                                                 ('transform', SwitchTransformer(est=CountVectorizer(binary=True), preaction="ravel", postaction=["and", "xor"])),
#                                             ])),
#                                 ])),
#                         ('est', LogisticRegression(C=0.01)),
#                         ]), switch=0, val=parentCategoryID, xcols=[2,3]))
#         for parentCategoryID in [1, 2, 4, 5, 6, 7, 8, 35, 110, 113]
    ]


def _get_estimator():
    """Build the default, unfitted classifier used by the training routines.

    Hyper-parameters are read from the environment:

    * ``N_ESTIMATORS`` -- passed as ``n_estimators`` (default ``"900"``).
    * ``MAX_DEPTH`` -- passed as ``max_depth`` (default ``"9"``).

    Returns:
        A new ``xgboost.XGBClassifier`` instance.

    Raises:
        ValueError: if either variable is set to something ``int()`` rejects.
    """
    from xgboost import XGBClassifier

    # The alternative estimators that used to follow this return statement
    # could never execute and have been removed.
    return XGBClassifier(
        n_estimators=int(os.environ.get("N_ESTIMATORS", "900")),
        max_depth=int(os.environ.get("MAX_DEPTH", "9"))
    )


def _get_conf():
    """Return the two-level stacking configuration as a 2-tuple.

    The first item is a list holding a single ``(names, sub-features,
    decision)`` entry for ``"meta"``; its only sub-feature is a ``"title"``
    pipeline and its decision is a logistic-regression
    ``SupervisedDecision``. The second item is the top-level
    ``SupervisedDecision`` wrapping an ``XGBClassifier``.
    """
    title_transform = SwitchTransformer(
        est=CountVectorizer(binary=True),
        preaction="ravel",
        postaction=["and", "xor"],
    )
    title_pipeline = Pipeline([
        ('transform', title_transform),
    ])
    meta_entry = (
        ["meta"],
        [(["title"], title_pipeline)],
        SupervisedDecision(est=LogisticRegression(C=1.0)),
    )
    final_decision = SupervisedDecision(est=XGBClassifier(n_estimators=100))
    return ([meta_entry], final_decision)


def newcv(n_folds=5):
    import gzip
    import numpy as np
    import pandas as pd
    from sklearn.cross_validation import StratifiedKFold
    from sklearn.cross_validation import cross_val_score
    from sklearn.metrics import get_scorer
    from sklearn.pipeline import Pipeline
    from sklearn.utils import shuffle
    from xgboost import XGBClassifier
    
    pr = pd.read_csv("data/ItemPairs_train.csv")

    kf = StratifiedKFold(pr["isDuplicate"].values, n_folds=n_folds, shuffle=True, random_state=42)
    
    feats = _get_features()
    scorer = get_scorer("roc_auc")
    
    scores = []
    
    for train, test in kf:
        start_time = datetime.datetime.now()
        
        itrain = shuffle(pr.iloc[train].index, random_state=42)
        itest = pr.iloc[test].index
        
        Xs = [_fit_transform("train", itrain, columns, transformer) for columns, transformer in feats]
        if any(sp.issparse(f) for f in Xs):
            Xs = sp.hstack(Xs).tocsr()
        else:
            Xs = np.hstack(Xs)

        print "Shape:", Xs.shape
        est = clone(_get_estimator())
        est.fit(Xs, pr.loc[itrain]["isDuplicate"])
        del Xs
        
        Xs = [_transform("train", itrain, itest, columns, transformer) for columns, transformer in feats]
        if any(sp.issparse(f) for f in Xs):
            Xs = sp.hstack(Xs).tocsr()
        else:
            Xs = np.hstack(Xs)
        
        scores.append(scorer(est, Xs, pr.loc[itest]["isDuplicate"]))
        
        print "CV:", (datetime.datetime.now() - start_time), Xs.shape, scores[-1]

        del Xs
        del est
        
    return np.array(scores)


def newtest(**kwargs):
    import pandas as pd
    from sklearn.linear_model import LogisticRegression
    from sklearn.pipeline import Pipeline
    from sklearn.utils import shuffle
    from xgboost import XGBClassifier
    
    prtrain = pd.read_csv("data/ItemPairs_train.csv")
#     prtrain = pd.merge(prtrain, pd.read_csv("data/ItemInfo_train.csv_categoryID.csv", index_col=0), left_on="itemID_1", right_index=True, how="inner", sort=False)
#     prtrain.drop(prtrain[prtrain.categoryID != 9].index, inplace=True)
    start_time = datetime.datetime.now()
    
    if kwargs.get("trainidx", None) is None:
        indices = shuffle(prtrain.index, random_state=42)
    else:
        indices = kwargs["trainidx"]
    prtrain = prtrain.loc[indices]

    if kwargs.get("feats", None) is None:
        feats = _get_features()
    else:
        feats = kwargs["feats"]

    featsum = []
#     Xs = [_fit_transform("train", indices, columns, transformer) for columns, transformer in feats]
#     Xs = []
    Xsparse = []
    featscount = 0
    for columns, transformer in feats:
        x = _fit_transform("train", indices, columns, transformer)
        print columns, x.shape
        featsum.append([columns, x.shape, ['f%d' % _ for _ in range(featscount, featscount+x.shape[1])]])
        Xsparse.append(sp.issparse(x))
        featscount += x.shape[1]
        del x
    print("FITTED")

    Xs = None
    for columns, transformer in feats:
        print("Loading: %s" % str(columns))
        if Xs is None:
            Xs = _fit_transform("train", indices, columns, transformer).astype(float)
            if any(Xsparse):
                Xs = sp.lil_matrix(Xs)
        else:
            if any(Xsparse):
                Xs = sp.hstack((
                        Xs,
                        _fit_transform("train", indices, columns, transformer).astype(float)
                    ))
            else:
                Xs = np.hstack((
                        Xs,
                        _fit_transform("train", indices, columns, transformer).astype(float)
                    ))
    if sp.issparse(Xs):
        Xs = Xs.tocsr()

    print "Shape:", Xs.shape
    
    if os.environ.get("FNAMES", "") != "":
        for fname in os.environ["FNAMES"].split(","):
            ff = pd.read_csv("features/%s.csv" % fname, index_col=range(2))
            fcolnames = [_ for _ in ff.columns if not _.endswith("_x") and not _.endswith("_y")]
            ff = pd.merge(
                prtrain.loc[indices],
                ff,
                left_on=["itemID_1", "itemID_2"],
                right_index=True,
                how="left", sort=False
            )
            Xs = sp.hstack((
                    Xs,
                    ff.loc[indices][fcolnames].values.astype(float)
                ))
        print "FNAMES Shape:", Xs.shape

    if kwargs.get("est", None) is None:
        est = clone(_get_estimator())
    else:
        est = clone(kwargs["est"])
    
    gc.collect()
    if kwargs.get("fitidx", None) is None:
        fitidx = indices
        if os.environ.get("STRATIFIED", None) is not None:
            fitidx = shuffle(get_train_indices(os.environ.get("STRATIFIED")), random_state=42)
    else:
        fitidx = kwargs["fitidx"]
    
    if len(fitidx) != len(indices) or (fitidx != indices).all():
        mask = indices.isin(fitidx)
        print "Fit Shape:", Xs[mask].shape
        est.fit(Xs[mask], prtrain.loc[mask]["isDuplicate"])
    else:
        est.fit(Xs, prtrain.loc[indices]["isDuplicate"])

    if os.environ.get("RESTRAIN", None) is not None:
        prtrain["probability"] = est.predict_proba(Xs)[:,1]
        with gzip.open(os.environ.get("RESTRAIN"), "wb") as f:
            prtrain[["probability"]].to_csv(f, index_label="id")
    del Xs
    
    try:
        fi = get_xgb_feat_importances(est)
        featsum = [(fi[fi.Feature.isin(_[2])]["Importance"].sum(), _[0], _[1]) for _ in featsum]
        for _ in sorted(featsum, key=lambda x: x[0], reverse=True):
            print "%.06f" % _[0], _[1], _[2]
#         for findex, fscore in iter(fi.values[:50]):
#             print fscore, featsum[int(findex.replace('f', ''))]
    except:
        # not xgb
        import sys
        sys.excepthook(*sys.exc_info())

    prtest = pd.read_csv("data/ItemPairs_test.csv", index_col=0)
#     prtest = pd.merge(prtest, pd.read_csv("data/ItemInfo_test.csv_categoryID.csv", index_col=0), left_on="itemID_1", right_index=True, how="inner", sort=False)
#     prtest.drop(prtest[prtest.categoryID != 9].index, inplace=True)

    if kwargs.get("testidx", None) is None:
        testidx = prtest.index
    else:
        testidx = kwargs["testidx"]
        prtest = prtest.loc[testidx]

    Xsparse = []
    for columns, transformer in feats:
        x = _transform("test", indices, testidx, columns, transformer)
        print columns, x.shape
        Xsparse.append(sp.issparse(x))
        del x
        
    Xs = None
    for columns, transformer in feats:
        if Xs is None:
            Xs = _transform("test", indices, testidx, columns, transformer).astype(float)
            if any(Xsparse):
                Xs = sp.csr_matrix(Xs)
        else:
            if any(Xsparse):
                Xs = sp.hstack((
                        Xs,
                        _transform("test", indices, testidx, columns, transformer).astype(float)
                    )).tocsr()
            else:
                Xs = np.hstack((
                        Xs,
                        _transform("test", indices, testidx, columns, transformer).astype(float)
                    ))
    
    del prtrain
    
    if os.environ.get("FNAMES", "") != "":
        for fname in os.environ["FNAMES"].split(","):
            ff = pd.read_csv("features/%s_test.csv" % fname, index_col=range(2))
            fcolnames = [_ for _ in ff.columns if not _.endswith("_x") and not _.endswith("_y")]
            ff = pd.merge(
                prtest.loc[testidx],
                ff,
                left_on=["itemID_1", "itemID_2"],
                right_index=True,
                how="left", sort=False
            )
            Xs = sp.hstack((
                    Xs,
                    ff.loc[testidx][fcolnames].values.astype(float)
                ))
        print "FNAMES Shape:", Xs.shape
    
    prtest["probability"] = est.predict_proba(Xs)[:,1]
    with gzip.open(os.environ.get("RESTEST", "/tmp/avito.csv.gz"), "wb") as f:
        prtest[["probability"]].to_csv(f)

    print "TEST:", (datetime.datetime.now() - start_time), Xs.shape

    return


def _score(**kwargs):
    import pandas as pd
    from sklearn.metrics import get_scorer
    from sklearn.utils import shuffle
    from xgboost import XGBClassifier

    prtrain = pd.read_csv("data/ItemPairs_train.csv")
    start_time = datetime.datetime.now()
    
    if kwargs.get("trainidx", None) is None:
        indices = shuffle(prtrain.index, random_state=42)
        if os.environ.get("STRATIFIED", None) is not None:
            indices = shuffle(get_train_indices(os.environ.get("STRATIFIED")), random_state=42)
    else:
        indices = kwargs["trainidx"]
    prtrain = prtrain.loc[indices]

    if kwargs.get("feats", None) is None:
        feats = _get_features()
    else:
        feats = kwargs["feats"]

    featsum = []
#     Xs = [_fit_transform("train", indices, columns, transformer) for columns, transformer in feats]
#     Xs = []
    Xsparse = []
    featscount = 0
    for columns, transformer in feats:
        x = _fit_transform("train", indices, columns, transformer)
        print columns, x.shape
        featsum.append([columns, x.shape, ['f%d' % _ for _ in range(featscount, featscount+x.shape[1])]])
        Xsparse.append(sp.issparse(x))
        featscount += x.shape[1]
        del x
    print("FITTED")

    Xs = None
    for columns, transformer in feats:
        print("Loading: %s" % str(columns))
        if Xs is None:
            Xs = _fit_transform("train", indices, columns, transformer).astype(float)
            if any(Xsparse):
                Xs = sp.lil_matrix(Xs)
        else:
            if any(Xsparse):
                Xs = sp.hstack((
                        Xs,
                        _fit_transform("train", indices, columns, transformer).astype(float)
                    ))
            else:
                Xs = np.hstack((
                        Xs,
                        _fit_transform("train", indices, columns, transformer).astype(float)
                    ))
    if sp.issparse(Xs):
        Xs = Xs.tocsr()

    print "Shape:", Xs.shape
    
    if kwargs.get("est", None) is None:
        est = clone(_get_estimator())
    else:
        est = clone(kwargs["est"])
    
    if kwargs.get("fitidx", None) is None:
        est.fit(Xs, prtrain.loc[indices]["isDuplicate"])
    else:
        fitidx = kwargs["fitidx"]
        mask = indices.isin(fitidx)
        est.fit(Xs[mask], prtrain.loc[mask]["isDuplicate"])
#     if os.environ.get("RESTRAIN", None) is not None:
#         prtrain["probability"] = est.predict_proba(Xs)[:,1]
#         with gzip.open(os.environ.get("RESTRAIN"), "wb") as f:
#             prtrain[["probability"]].to_csv(f, index_label="id")
    del Xs
    
    prtest = pd.read_csv("data/ItemPairs_train.csv")

    if kwargs.get("testidx", None) is None:
        testidx = prtest.index
    else:
        testidx = kwargs["testidx"]
        prtest = prtest.loc[testidx]

    Xsparse = []
    for columns, transformer in feats:
        x = _transform("train", indices, testidx, columns, transformer)
        print columns, x.shape
        Xsparse.append(sp.issparse(x))
        del x
        
    Xs = None
    for columns, transformer in feats:
        if Xs is None:
            Xs = _transform("train", indices, testidx, columns, transformer).astype(float)
            if any(Xsparse):
                Xs = sp.csr_matrix(Xs)
        else:
            if any(Xsparse):
                Xs = sp.hstack((
                        Xs,
                        _transform("train", indices, testidx, columns, transformer).astype(float)
                    )).tocsr()
            else:
                Xs = np.hstack((
                        Xs,
                        _transform("train", indices, testidx, columns, transformer).astype(float)
                    ))
    
    del prtrain
    
#     prtest["probability"] = est.predict_proba(Xs)[:,1]
#     with gzip.open(os.environ.get("RESTEST", "/tmp/avito.csv.gz"), "wb") as f:
#         prtest[["probability"]].to_csv(f)

    score = get_scorer("roc_auc")(est, Xs, prtest["isDuplicate"].values)
    
    print "TEST:", (datetime.datetime.now() - start_time), Xs.shape

    return score


def _pcattest():
    import gc
    from sklearn.cross_validation import train_test_split
    from sklearn.grid_search import ParameterGrid
    
    prtrain = pd.read_csv("data/ItemPairs_train.csv")
    prtrain = pd.merge(
        prtrain,
        pd.read_csv("data/ItemInfo_train.csv_parentCategoryID.csv", index_col=0),
        left_on="itemID_1", right_index=True,
        how="inner", sort=False
    )

    prtest = pd.read_csv("data/ItemPairs_test.csv", index_col=0)
    prtest = pd.merge(
        prtest,
        pd.read_csv("data/ItemInfo_test.csv_parentCategoryID.csv", index_col=0),
        left_on="itemID_1", right_index=True,
        how="inner", sort=False
    )
    
    cats = pd.read_csv("data/Category.csv", index_col=0)
    
    for pcatid in np.unique(prtrain["parentCategoryID"].values):
        print "PARENT CATEGORY: %d" % pcatid
        os.environ["RESTRAIN"] = "/tmp/avito-pcat-%d-train.csv.gz" % pcatid
        os.environ["RESTEST"] = "/tmp/avito-pcat-%d-test.csv.gz" % pcatid
        if all((os.path.exists(os.environ["RESTRAIN"]),
                os.path.exists(os.environ["RESTEST"]),
                os.environ.get("RECALC", None) is None)):
            continue

        scores = []
        fulltrain = shuffle(prtrain[prtrain["parentCategoryID"] == pcatid].index, random_state=42)
        fulltest = prtest[prtest["parentCategoryID"] == pcatid].index
        train, test = train_test_split(fulltrain, test_size=1.0 * len(fulltest) / len(fulltrain), random_state=42)
        param_grid = ParameterGrid({
                "n_estimators": [50, 100, 200, 300, 500, 700],
                "max_depth": [3, 4, 5, 6, 7],
            })
        param_grid = ParameterGrid({
                "n_estimators": [50, 100, 200, 300, 500],
                "max_depth": [3, 4, 5],
            })
        
        for param_set in param_grid:
            scores.append({
                    "params": param_set,
                    "score": _score(
                        trainidx=train,
                        testidx=test,
                        feats = _get_features(
                            pcats=[pcatid],
                            cats=list(cats[cats["parentCategoryID"] == pcatid].index)
                        ),
                        est=XGBClassifier(**param_set))
                })
            print scores[-1]
        
        scores = sorted(scores, key=lambda x: x["score"], reverse=True)
        for score in scores:
            print score["score"], score["params"]
        
        newtest(
            trainidx=fulltrain,
            testidx=fulltest,
            feats = _get_features(
                pcats=[pcatid],
                cats=list(cats[cats["parentCategoryID"] == pcatid].index)
            ),
            est=XGBClassifier(**scores[0]["params"])
        )
        gc.collect()


def _cattest():
    import gc
    from sklearn.cross_validation import train_test_split
    from sklearn.grid_search import ParameterGrid
    
    prtrain = pd.read_csv("data/ItemPairs_train.csv")
    prtrain = pd.merge(
        prtrain,
        pd.read_csv("data/ItemInfo_train.csv_categoryID.csv", index_col=0),
        left_on="itemID_1", right_index=True,
        how="inner", sort=False
    )

    prtest = pd.read_csv("data/ItemPairs_test.csv", index_col=0)
    prtest = pd.merge(
        prtest,
        pd.read_csv("data/ItemInfo_test.csv_categoryID.csv", index_col=0),
        left_on="itemID_1", right_index=True,
        how="inner", sort=False
    )
    
    cats = pd.read_csv("data/Category.csv", index_col=0)
    
    for catid in np.unique(prtrain["categoryID"].values):
        print "CATEGORY: %d" % catid
        os.environ["RESTRAIN"] = "/tmp/avito-cat-%d-train.csv.gz" % catid
        os.environ["RESTEST"] = "/tmp/avito-cat-%d-test.csv.gz" % catid
        if all((os.path.exists(os.environ["RESTRAIN"]),
                os.path.exists(os.environ["RESTEST"]),
                os.environ.get("RECALC", None) is None)):
            continue
        
        scores = []
        fulltrain = shuffle(prtrain[prtrain["categoryID"] == catid].index, random_state=42)
        fulltest = prtest[prtest["categoryID"] == catid].index
        train, test = train_test_split(fulltrain, test_size=1.0 * len(fulltest) / len(fulltrain), random_state=42)
        param_grid = ParameterGrid({
                "n_estimators": [30, 50, 70, 100, 150, 200],
                "max_depth": [1, 2, 3, 4, 5],
            })
        param_grid = ParameterGrid({
                "n_estimators": [30, 50, 70, 100],
                "max_depth": [1, 2, 3],
            })
        
        for param_set in param_grid:
            scores.append({
                    "params": param_set,
                    "score": _score(
                        trainidx=train,
                        testidx=test,
                        feats = _get_features(
                            pcats=[cats.loc[catid, "parentCategoryID"]],
                            cats=[catid]
                        ),
                        est=XGBClassifier(**param_set))
                })
            print scores[-1]
        
        scores = sorted(scores, key=lambda x: x["score"], reverse=True)
        for score in scores:
            print score["score"], score["params"]
        
        newtest(
            trainidx=fulltrain,
            testidx=fulltest,
            feats = _get_features(
                pcats=[cats.loc[catid, "parentCategoryID"]],
                cats=[catid]
            ),
            est=XGBClassifier(**scores[0]["params"])
        )
        gc.collect()


def _gentest():
    import gc
    from sklearn.cross_validation import train_test_split
    from sklearn.grid_search import ParameterGrid
    
    prtrain = pd.read_csv("data/ItemPairs_train.csv")
    prtrain = pd.merge(
        prtrain,
        pd.read_csv("data/ItemInfo_train.csv_categoryID.csv", index_col=0),
        left_on="itemID_1", right_index=True,
        how="inner", sort=False
    )

    prtest = pd.read_csv("data/ItemPairs_test.csv", index_col=0)
    prtest = pd.merge(
        prtest,
        pd.read_csv("data/ItemInfo_test.csv_categoryID.csv", index_col=0),
        left_on="itemID_1", right_index=True,
        how="inner", sort=False
    )
    
    cats = pd.read_csv("data/Category.csv", index_col=0)
    
    for genmethod in np.unique(prtrain["generationMethod"].values):
        print "GENERATION METHOD: %d" % genmethod
        os.environ["RESTRAIN"] = "/tmp/avito-gen-%d-train.csv.gz" % genmethod
        os.environ["RESTEST"] = "/tmp/avito-gen-%d-test.csv.gz" % genmethod
        if all((os.path.exists(os.environ["RESTRAIN"]),
                os.path.exists(os.environ["RESTEST"]),
                os.environ.get("RECALC", None) is None)):
            continue
        
        scores = []
        fulltrain = shuffle(prtrain.index, random_state=42)
        fittrain = shuffle(prtrain[prtrain["generationMethod"] == genmethod].index, random_state=42)
        fulltest = prtest.index
        
        newtest(
            trainidx=fulltrain,
            testidx=fulltest,
            fitidx=fittrain
        )
        gc.collect()
    

def _catmerge():
    cats = pd.read_csv("data/Category.csv", index_col=0)
    
    _cattest()
    
    for catid in np.unique(cats.index):
        print "CATEGORY: %d" % catid
        with gzip.open("/tmp/avito-cat-%d-test.csv.gz" % catid) as fi:
            try:
                prtest = pd.concat((
                        prtest,
                        pd.read_csv(fi, index_col=0)
                    ), axis=0)
            except NameError:
                prtest = pd.read_csv(fi, index_col=0)
    prtest.sort_index(inplace=True)
    with gzip.open("/tmp/avito-catmerge.csv.gz", "wb") as fo:
        prtest[["probability"]].to_csv(fo)


def _pcatmerge():
    cats = pd.read_csv("data/Category.csv", index_col=0)
    
    _pcattest()
    
    for pcatid in np.unique(cats["parentCategoryID"].values):
        print "PARENT CATEGORY: %d" % pcatid
        with gzip.open("/tmp/avito-pcat-%d-test.csv.gz" % pcatid) as fi:
            try:
                prtest = pd.concat((
                        prtest,
                        pd.read_csv(fi, index_col=0)
                    ), axis=0)
            except NameError:
                prtest = pd.read_csv(fi, index_col=0)
    prtest.sort_index(inplace=True)
    with gzip.open("/tmp/avito-pcatmerge.csv.gz", "wb") as fo:
        prtest[["probability"]].to_csv(fo)


def _newcatmerge():
    cats = pd.read_csv("data/Category.csv", index_col=0)
    
    _cattest()
    
    prtrain = pd.read_csv("data/ItemPairs_train.csv")[["isDuplicate"]]
    prtrain = prtrain.loc[shuffle(prtrain.index, random_state=42)]
    cats = pd.read_csv("data/Category.csv", index_col=0)
    
    with gzip.open("/tmp/avito-train.csv.gz") as fi:
        prtrain["temp"] = pd.read_csv(fi, index_col=0)["probability"]
        X = sp.csr_matrix(prtrain[["temp"]].values.astype(float) - 0.5)

    for catid in np.unique(cats.index):
        print "CATEGORY: %d" % catid
        with gzip.open("/tmp/avito-cat-%d-train.csv.gz" % catid) as fi:
            prtrain["temp"] = pd.read_csv(fi, index_col=0)["probability"].astype(float) - 0.5
            X = sp.hstack((X, sp.csr_matrix(np.nan_to_num(prtrain[["temp"]].values))))

    print "Shape:", X.shape
    est = Pipeline([
            ('estimator', _get_estimator()),
        ])

    y = prtrain["isDuplicate"].values
    del prtrain
    est.fit(X, y)
    del X
    del y

    prtest = pd.read_csv("data/ItemPairs_test.csv", index_col=0)[[]]
    with gzip.open("/tmp/avito-test.csv.gz") as fi:
        prtest["temp"] = pd.read_csv(fi, index_col=0)["probability"]
        X = sp.csr_matrix(prtest[["temp"]].values.astype(float) - 0.5)

    for catid in np.unique(cats.index):
        print "CATEGORY: %d" % catid
        with gzip.open("/tmp/avito-cat-%d-test.csv.gz" % catid) as fi:
            prtest["temp"] = pd.read_csv(fi, index_col=0)["probability"].astype(float) - 0.5
            X = sp.hstack((X, sp.csr_matrix(np.nan_to_num(prtest[["temp"]].values))))
    prtest["probability"] = est.predict_proba(X)[:,1]
    del X
    
    prtest.sort_index(inplace=True)
    with gzip.open("/tmp/avito-newcatmerge.csv.gz", "wb") as fo:
        prtest[["probability"]].to_csv(fo)


def _genmerge():
    from sklearn.cross_validation import train_test_split
    from sklearn.grid_search import ParameterGrid
    from sklearn.linear_model import SGDClassifier
    from sklearn.metrics import classification_report
    from sklearn.metrics import get_scorer
    from sklearn.svm import LinearSVC
    
    prtrain = pd.read_csv("data/ItemPairs_train.csv")[["isDuplicate"]]
    prtrain = prtrain.loc[shuffle(prtrain.index, random_state=42)]
    
    with gzip.open("/tmp/avito-train.csv.gz") as fi:
        prtrain["temp"] = pd.read_csv(fi, index_col=0)["probability"]
        X = sp.csr_matrix(prtrain[["temp"]].values.astype(float))

#     with gzip.open("/tmp/avito-linear-train.csv.gz") as fi:
#         prtrain["temp"] = pd.read_csv(fi, index_col=0)["probability"]
#         X = sp.hstack((X, sp.csr_matrix(np.nan_to_num(prtrain[["temp"]].values))))

    for genmethod in (1, 3):
        print "METHOD: %d" % genmethod
        with gzip.open("/tmp/avito-gen-%d-train.csv.gz" % genmethod) as fi:
            prtrain["temp"] = pd.read_csv(fi, index_col=0)["probability"].astype(float)
            X = sp.hstack((X, sp.csr_matrix(np.nan_to_num(prtrain[["temp"]].values))))
    
    print "Shape:", X.shape
    y = prtrain["isDuplicate"].values
    del prtrain
    
#     param_grid = ParameterGrid({
#             "n_estimators": [1, 5, 10],
#             "max_depth": [1, 2, 3, 5],
#         })
# #     param_grid = ParameterGrid({
# #             "alpha": [0.0001, 0.001, 0.01, 0.1, 1.0],
# #             "loss": ["log", "modified_huber"],
# #         })
# #     param_grid = ParameterGrid({
# #             "C": [0.0001, 0.001, 0.01, 0.1, 1.0],
# #         })
# #     param_grid = ParameterGrid({
# #             "C": [1.0],
# #         })
    
#     scores = []
#     train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.3, random_state=42)
#     for params in param_grid:
#         print params
#         est = XGBClassifier(**params) # {'n_estimators': 10, 'max_depth': 2}
# #         est = SGDClassifier(**params) # defaults
# #         est = LogisticRegression(**params) # defaults 
# #         est = LinearSVC(**params)
#         est.fit(train_x, train_y)
#         print classification_report(test_y, est.predict(test_x))
#         score = get_scorer("roc_auc")(est, test_x, test_y)
#         print score
#         scores.append({"params": params, "score": score})
    
#     scores = sorted(scores, key=lambda x: x["score"], reverse=True)
#     for score in scores:
#         print score
    
#     exit()
    est = Pipeline([
            ('estimator', Pipeline([
                        ("filter", VarianceThreshold()),
#                         ("scaler", StandardScaler(with_mean=False)),
                        ("est", LogisticRegression())])),
#             ('estimator', XGBClassifier(n_estimators=50, max_depth=2)),
        ])

    est.fit(X, y)
    del X
    del y

    prtest = pd.read_csv("data/ItemPairs_test.csv", index_col=0)[[]]
    with gzip.open("/tmp/avito.csv.gz") as fi:
        prtest["temp"] = pd.read_csv(fi, index_col=0)["probability"]
        X = sp.csr_matrix(prtest[["temp"]].values.astype(float))

#     with gzip.open("/tmp/avito-linear-test.csv.gz") as fi:
#         prtest["temp"] = pd.read_csv(fi, index_col=0)["probability"]
#         X = sp.hstack((X, sp.csr_matrix(np.nan_to_num(prtest[["temp"]].values))))

    for genmethod in (1, 3):
        print "METHOD: %d" % genmethod
        with gzip.open("/tmp/avito-gen-%d-test.csv.gz" % genmethod) as fi:
            prtest["temp"] = pd.read_csv(fi, index_col=0)["probability"].astype(float)
            X = sp.hstack((X, sp.csr_matrix(np.nan_to_num(prtest[["temp"]].values))))
    prtest["probability"] = est.predict_proba(X)[:,1]
    del X
    
    prtest.sort_index(inplace=True)
    with gzip.open("/tmp/avito-genmerge.csv.gz", "wb") as fo:
        prtest[["probability"]].to_csv(fo)


def cattest():
    import gzip
    import pandas as pd
    from sklearn.linear_model import LogisticRegression
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler
    from sklearn.utils import shuffle
    from xgboost import XGBClassifier
    
    prtrain = pd.read_csv("data/ItemPairs_train.csv")
    prtest = pd.read_csv("data/ItemPairs_test.csv", index_col=0)

    start_time = datetime.datetime.now()
    
    indices = shuffle(prtrain.index, random_state=42)
#     indices = shuffle(get_train_indices("categoryID"), random_state=42)

    est = MetaCategoryEstimator(
        indices,
        prtest.index,
        [
            (["pair_imagesdist_Correlation_%d" % _], VarianceThreshold())
            for _ in [95]
        ] + [
            (["pair_imagesdist_Chi-Squared_%d" % _], VarianceThreshold())
            for _ in [5]
        ] + [
            (["pair_imagesdist_Hellinger_%d" % _], VarianceThreshold())
            for _ in [5]
        ] + [
            (["pair_imagesdist_Intersection_%d" % _], VarianceThreshold())
            for _ in [230]
        ] + [
            (["title"], SwitchTransformer(est=CountVectorizer(analyzer="char_wb", ngram_range=(1,5), binary=True), preaction="ravel", postaction=["and", "xor"])),
            (["description"], SwitchTransformer(est=CountVectorizer(binary=True), preaction="ravel", postaction=["and", "xor"])),
            (["attrsJSON"], Pipeline([
                        ("tr1", JsonToDict(1)),
                        ("tr2", JsonAttrsTrf(1)),
                        ("imputer", ConstantCol()),
                    ])),
            (["locationID"], Pipeline([
                        ("binrz", EqNonEqBinarizer()),
                    ])),
            (["regionID"], Pipeline([
                        ("binrz", EqNonEqBinarizer()),
                    ])),
            (["metroID"], Pipeline([
                        ("binrz", EqNonEqBinarizer()),
                    ])),
            (["price"], Pipeline([
                        ("binrz", EqNonEqBinarizer()),
                    ])),
            (["pair_imagesdist_Exact"], VarianceThreshold()),
            ([], Pipeline([("scaler", StandardScaler(with_mean=False)), ("est", LogisticRegression(C=0.01))])),
#             ([], XGBClassifier(n_estimators=100, max_depth=3)),
        ],
        [
            (["pair_imagesdist_Correlation_%d" % _], VarianceThreshold())
            for _ in [90]
        ] + [
            (["pair_imagesdist_Chi-Squared_%d" % _], VarianceThreshold())
            for _ in [10]
        ] + [
            (["pair_imagesdist_Hellinger_%d" % _], VarianceThreshold())
            for _ in [10]
        ] + [
            (["pair_imagesdist_Intersection_%d" % _], VarianceThreshold())
            for _ in [200]
        ] + [
            (["title"], SwitchTransformer(est=CountVectorizer(binary=True), preaction="ravel", postaction=["and", "xor"])),
            (["description"], SwitchTransformer(est=CountVectorizer(binary=True), preaction="ravel", postaction=["and", "xor"])),
            (["attrsJSON"], Pipeline([
                        ("tr1", JsonToDict(1)),
                        ("tr2", JsonAttrsTrf(1)),
                        ("imputer", ConstantCol()),
                    ])),
            (["locationID"], Pipeline([
                        ("binrz", EqNonEqBinarizer()),
                    ])),
            (["regionID"], Pipeline([
                        ("binrz", EqNonEqBinarizer()),
                    ])),
            (["metroID"], Pipeline([
                        ("binrz", EqNonEqBinarizer()),
                    ])),
            (["price"], Pipeline([
                        ("binrz", EqNonEqBinarizer()),
                    ])),
            (["locationID", "lat", "lon"], LocationDist(list(np.logspace(-3.5, 6, 20, base=2)))),
            (["regionID", "lat", "lon"], LocationDist(list(np.logspace(-2, 7, 20, base=2)))),
            (["metroID", "lat", "lon"], LocationDist(list(np.logspace(-4, 4, 5, base=2)))),
            (["lat", "lon"], CoordDist()),
            (["categoryID"], Pipeline([
                        ('binrz', EqNonEqBinarizer()),
                        ('filter', VarianceThreshold()),
                    ])),
            ([], Pipeline([("scaler", StandardScaler(with_mean=False)), ("est", LogisticRegression(C=1.0))])),
        ], None
    )
    est.fit(indices.values.reshape(-1,1), prtrain.loc[indices, "isDuplicate"].values)
    
    prtest["probability"] = est.predict_proba(prtest.index.values.reshape(-1,1))[:,1]
    with gzip.open(os.getenv("RESFILE", "/tmp/avito.csv.gz"), "wb") as f:
        prtest[["probability"]].to_csv(f, float_format="%.7f")

    print "TEST:", (datetime.datetime.now() - start_time)

    return


def _lintest():
    """Run newtest with the linear feature set and a default LogisticRegression.

    Before the call, the RESTRAIN and RESTEST environment variables are set to
    the "/tmp/avito-linear-train.csv.gz" and "/tmp/avito-linear-test.csv.gz"
    paths.  NOTE(review): presumably newtest reads these variables to choose
    its output files -- confirm against newtest.

    Both index sets are shuffled with a fixed random_state of 42: trainidx
    comes from get_train_indices("all"), fitidx from get_train_indices().
    """
    result_paths = (
        ("RESTRAIN", "/tmp/avito-linear-train.csv.gz"),
        ("RESTEST", "/tmp/avito-linear-test.csv.gz"),
    )
    for env_name, env_value in result_paths:
        os.environ[env_name] = env_value

    train_indices = shuffle(get_train_indices("all"), random_state=42)
    fit_indices = shuffle(get_train_indices(), random_state=42)
    linear_features = _get_features(linear=True)
    classifier = LogisticRegression()

    newtest(
        trainidx=train_indices,
        fitidx=fit_indices,
        feats=linear_features,
        est=classifier
    )


if __name__ == "__main__":
    import sys
    import warnings
    warnings.simplefilter("ignore", DeprecationWarning)
    if len(sys.argv) > 1 and sys.argv[-1] == "test":
        newtest()
    elif len(sys.argv) > 1 and sys.argv[-1] == "cattest":
        _cattest()
    elif len(sys.argv) > 1 and sys.argv[-1] == "pcattest":
        _pcattest()
    elif len(sys.argv) > 1 and sys.argv[-1] == "gentest":
        _gentest()
    elif len(sys.argv) > 1 and sys.argv[-1] == "lintest":
        _lintest()
    elif len(sys.argv) > 1 and sys.argv[-1] == "catmerge":
        _catmerge()
    elif len(sys.argv) > 1 and sys.argv[-1] == "pcatmerge":
        _pcatmerge()
    elif len(sys.argv) > 1 and sys.argv[-1] == "newcatmerge":
        _newcatmerge()
    elif len(sys.argv) > 1 and sys.argv[-1] == "genmerge":
        _genmerge()
    else:
        print newcv(5).mean()
