In [1]:
import pandas as pd, numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import bokeh.plotting
import catboost
from sklearn.model_selection import train_test_split, StratifiedKFold
import sklearn.preprocessing, sklearn.feature_selection, sklearn.model_selection
from sklearn.pipeline import Pipeline, make_pipeline
import sklearn.base
from sklearn.base import BaseEstimator, TransformerMixin
import seaborn as sns
import dateutil.parser
import collections
import sklearn.utils
import itertools
import re

import jupyternotify
ip = get_ipython()
ip.register_magics(jupyternotify.JupyterNotifyMagics)

SEED = 42
# Seed NumPy's legacy global RNG. This has to be a *call*: assigning to
# `np.random.seed` would replace the function and leave the RNG unseeded.
np.random.seed(SEED)

%matplotlib inline
%load_ext Cython

<IPython.core.display.Javascript object>

In [2]:
%%cython
cimport numpy as np

cpdef float qtof_(x):
    """Map a textual water-quantity level to a numeric score.

    Known levels are looked up in a fixed table; any other value
    (including values that are not in the table at all) yields 0.
    """
    level_scores = {
        'enough': 1.0,
        'insufficient': .6,
        'seasonal': .4,
        'dry': .2,
        'unknown': 0,
    }
    # Single lookup with a default instead of a membership test + index.
    return level_scores.get(x, 0)

In [3]:
class LabelencodeAll:
    """Label-encode every non-numeric column listed in ``cols``.

    One ``sklearn.preprocessing.LabelEncoder`` is fitted per column and kept
    in ``encs_`` (column name -> encoder). Columns that are missing from the
    frame, or that are already numeric, are skipped.
    """

    def __init__(self, cols):
        self.cols = cols
        self.encs_ = collections.defaultdict(sklearn.preprocessing.LabelEncoder)

    @staticmethod
    def _is_numeric(series):
        # Equivalent to np.issubdtype(dtype, np.number) for NumPy dtypes
        # (bool is *not* numeric), but also safe for pandas extension dtypes.
        return (pd.api.types.is_numeric_dtype(series)
                and not pd.api.types.is_bool_dtype(series))

    def fit(self, X, y=None):
        """Fit one encoder per listed, present, non-numeric column of ``X``."""
        # Start from a clean mapping so a refit does not keep stale encoders.
        self.encs_ = collections.defaultdict(sklearn.preprocessing.LabelEncoder)
        for c in self.cols:
            if c not in X.columns:
                continue
            if not self._is_numeric(X[c]):
                self.encs_[c].fit(X[c])
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the fitted columns replaced by codes."""
        X = X.copy()
        for c, enc in self.encs_.items():
            if c not in X.columns:
                continue
            if not self._is_numeric(X[c]):
                # Replace the whole column (not `.loc[:, c] = ...`) so the
                # result takes the integer dtype of the codes instead of
                # staying `object`.
                X[c] = enc.transform(X[c])
        return X

class DecodeQuantity(BaseEstimator, TransformerMixin):
    """Stateless transformer that maps column ``col`` through ``qtof_``."""

    def __init__(self, col):
        self.col = col

    def fit(self, X, y=None):
        # Nothing to learn.
        return self

    def transform(self, X):
        """Return a copy of ``X`` whose ``col`` values were passed to ``qtof_``."""
        decoded = X.copy()
        target = self.col
        decoded.loc[:, target] = decoded.loc[:, target].map(qtof_)
        return decoded
    
    
class HideMissingValues(BaseEstimator, TransformerMixin):
    """Fill missing values column by column, choosing the rule from the data.

    * numeric columns: missing -> column median (0 if the median is NaN);
    * columns holding only True/False (and missing): True -> 1.0,
      False -> 0.0, missing -> 0.5;
    * everything else: missing -> ``'unknown'``, then cast to ``str``.
    """

    def fit(self, X, y=None):
        # Stateless: the fill values are recomputed from the frame passed to
        # transform().
        return self

    def transform(self, X):
        """Return a filled copy of ``X``; the input frame is not modified."""
        X = X.copy()
        for c in X.columns:
            col = X[c]
            is_numeric = (pd.api.types.is_numeric_dtype(col)
                          and not pd.api.types.is_bool_dtype(col))
            if is_numeric:
                med = col.median()
                if pd.isnull(med):
                    med = 0
                # Assign the filled column back: `fillna(inplace=True)` on a
                # `.loc` slice is not guaranteed to write through to X.
                X[c] = col.fillna(med)
            elif set(col.dropna().unique()).issubset({True, False}):
                # Record missing positions *before* recoding, otherwise the
                # "not True -> 0" rule would also swallow the missing values.
                missing = col.isnull().values
                flags = np.where((col == 1).values, 1.0, 0.0)
                flags[missing] = 0.5
                X[c] = flags
            else:
                X[c] = col.fillna('unknown').astype('str')
        return X
    

class DropGarbage(BaseEstimator, TransformerMixin):
    """Stateless transformer that removes the columns named in ``cols``.

    Columns that are not present in the frame are silently ignored.
    """

    def __init__(self, cols=['id', 'funder', 'recorded_by', 'date_recorded']):
        self.cols = cols

    def fit(self, X, y=None):
        # Nothing to learn.
        return self

    def transform(self, X):
        """Return ``X`` without the configured columns."""
        unwanted = self.cols
        return X.drop(columns=unwanted, errors='ignore')

class SplitDate(BaseEstimator, TransformerMixin):
    """Expand a date column into ``_year``, ``_month``, ``_day``, ``_weekday``.

    Parameters
    ----------
    col : str
        Name of the column holding date strings or date-like objects.
    drop : bool, default True
        Whether to remove the original column from the result.
    """

    _SUFFIXES = ('_year', '_month', '_day', '_weekday')

    def __init__(self, col, drop=True):
        self.col = col
        self.drop = drop

    def fit(self, X, y=None):
        # Stateless transformer.
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the four date-part columns added.

        Strings are parsed with ``dateutil.parser.parse``; strings that cannot
        be parsed and null values produce NaN in all four new columns.
        """
        missing = (np.nan, np.nan, np.nan, np.nan)

        def t(x):
            if isinstance(x, str):
                try:
                    x = dateutil.parser.parse(x)
                # Only swallow parse failures; a bare `except:` would also
                # hide KeyboardInterrupt and genuine programming errors.
                except (ValueError, OverflowError):
                    return missing
            elif pd.isnull(x):
                return missing
            return (x.year, x.month, x.day, x.weekday())

        X = X.copy()
        new_names = [self.col + suffix for suffix in self._SUFFIXES]
        # Building a frame (rather than unpacking zip(*...)) also works when
        # X has no rows.
        parts = pd.DataFrame(list(X[self.col].map(t)),
                             index=X.index, columns=new_names)
        for name in new_names:
            X[name] = parts[name]
        if self.drop:
            X = X.drop(self.col, axis=1)
        return X

    
class HandyFeatures(BaseEstimator, TransformerMixin):
    """Add the derived feature ``amount_per_man = amount_tsh / population``."""

    def fit(self, X, y=None):
        # Stateless transformer.
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the ``amount_per_man`` column added.

        Rows with zero population would give +/-inf (or NaN for 0/0). The
        infinities are turned into NaN so a later missing-value step can fill
        them like any other gap, instead of leaking inf into the model.
        """
        X = X.copy()
        ratio = X['amount_tsh'] / X['population']
        X['amount_per_man'] = ratio.replace([np.inf, -np.inf], np.nan)
        return X

class OutcomeFrequences(BaseEstimator, TransformerMixin):
    """Append, for every row, the outcome frequencies of its group.

    During ``fit`` the rows are grouped by the values of the ``groupby``
    columns and, per group, the relative frequency of every outcome in ``y``
    is stored. ``transform`` adds one ``<groupby>_<outcome>_fq`` column per
    outcome; rows whose group was not seen during ``fit`` get 0 everywhere.

    Parameters
    ----------
    groupby : list of str
        Column names that form the group key.
    drop : bool, default False
        Whether to drop the ``groupby`` columns from the transformed frame.

    Attributes
    ----------
    cnts_ : dict
        Group key (tuple) -> array of outcome frequencies, ordered like
        ``possible_outcomes_``.
    possible_outcomes_ : list
        Outcome labels in sorted order (for string labels, the last word).
    fq_cols_ : list of str
        Names of the generated columns.
    unknowns_ : float
        Fraction of rows in the last ``transform`` call with an unseen key.
    """

    def __init__(self, groupby, drop=False):
        self.groupby = groupby
        self.drop = drop
        self.cnts_ = None
        self.fq_cols_ = None
        self.unknowns_ = 0

    def _group_keys(self, X):
        # One tuple of group-by values per row, in row order. Iterating the
        # column slice once is far cheaper than `X.iloc[i, ...]` per row.
        igroupby = [X.columns.get_loc(c) for c in self.groupby]
        return X.iloc[:, igroupby].itertuples(index=False, name=None)

    def fit(self, X, y=None):
        """Learn the per-group outcome frequencies from ``X`` and ``y``."""
        if y is None:
            raise ValueError('OutcomeFrequences.fit requires the outcomes y')
        y = np.asarray(y)
        if y.shape[0] != X.shape[0]:
            raise ValueError('X and y must have the same number of rows')
        classes = np.unique(y)
        # Map each label to its slot so labels need not be exactly 0..n-1.
        position = {cls: i for i, cls in enumerate(classes)}
        possible_outcomes = [_.split()[-1] if isinstance(_, str) else _ for _ in classes]
        n_outs = len(classes)
        counts = collections.defaultdict(lambda: np.zeros(n_outs))
        for keys, out in zip(self._group_keys(X), y):
            counts[keys][position[out]] += 1  # occurrences of (keys, out)
        # True division: floor division would collapse every frequency to 0/1.
        self.cnts_ = {keys: cnt / cnt.sum() for keys, cnt in counts.items()}

        self.possible_outcomes_ = possible_outcomes
        self.fq_cols_ = ['_'.join(list(self.groupby) + [str(out), 'fq'])
                         for out in possible_outcomes]
        return self

    def transform(self, X):
        """Return ``X`` with the frequency columns appended."""
        n_rows = X.shape[0]
        freqs = np.zeros((n_rows, len(self.possible_outcomes_)))
        unknown = 0
        for i, keys in enumerate(self._group_keys(X)):
            row = self.cnts_.get(keys)
            if row is None:
                unknown += 1  # unseen group: frequencies stay at 0
            else:
                freqs[i] = row
        # Share of *rows* (not cells) with an unseen key.
        self.unknowns_ = unknown / n_rows if n_rows else 0
        # Reuse X's index so concat aligns row by row, whatever the index is.
        new_cols = pd.DataFrame(freqs, columns=self.fq_cols_, index=X.index)
        X = pd.concat((X, new_cols), axis=1)
        if self.drop:
            X = X.drop(self.groupby, axis=1, errors='ignore')
        return X

class ExplicitAge(BaseEstimator, TransformerMixin):
    """Add ``age = date_recorded_year - construction_year`` as a column."""

    def fit(self, X, y=None):
        # Nothing to learn.
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the ``age`` column set."""
        aged = X.copy()
        recorded_year = aged.loc[:, 'date_recorded_year']
        built_year = aged.loc[:, 'construction_year']
        aged.loc[:, 'age'] = recorded_year - built_year
        return aged

def show_intersections(A, B, add_cat_feats=('date_recorded', 'construction_year',
                                            'date_recorded_year', 'date_recorded_month')):
    """Measure how much the categorical values of ``A`` and ``B`` overlap.

    A column of ``A`` counts as categorical when it has ``object`` dtype, its
    name ends with ``'code'``, or it is listed in ``add_cat_feats``. Columns
    that are absent from ``B`` are skipped and missing values are ignored.

    Parameters
    ----------
    A, B : pandas.DataFrame
        Frames to compare (e.g. train and test).
    add_cat_feats : iterable of str
        Extra column names to treat as categorical.

    Returns
    -------
    pandas.DataFrame
        One row per categorical column with ``var``, ``intersection/union``
        (shared values / all values) and ``intersection/B`` (shared values /
        values of ``B``), sorted by ``intersection/B``. A ratio is NaN when
        its denominator is zero.
    """
    rows = []
    # Inspect A itself rather than a notebook global, so the function keeps
    # working for any pair of frames.
    for c in A.columns:
        is_cat = ((A[c].dtype == object)
                  or (isinstance(c, str) and c.endswith('code'))
                  or (c in add_cat_feats))
        if not is_cat or c not in B.columns:
            continue
        a_vals = set(A[c].dropna().unique())
        b_vals = set(B[c].dropna().unique())
        n_int = len(a_vals & b_vals)
        n_uni = len(a_vals | b_vals)
        n_b = len(b_vals)
        rows.append((c,
                     n_int / n_uni if n_uni else np.nan,
                     n_int / n_b if n_b else np.nan))
    cat_feats_observed = (
        pd.DataFrame(rows, columns=['var', 'intersection/union', 'intersection/B'])
        .sort_values('intersection/B')
    )
    return cat_feats_observed

        
def split_train_cv_test(X, y, proportions=(.75, .25/2, .25/2), random_state=None):
    """Split ``X``/``y`` into several parts, stratified by the values of ``y``.

    Every class is shuffled and cut according to ``proportions``; the last
    part of each class receives whatever rows remain after rounding down.

    Parameters
    ----------
    X : pandas.DataFrame
        Features; rows are selected positionally.
    y : array-like
        Class label per row of ``X``.
    proportions : sequence of float
        Relative size of each part.
    random_state : int, numpy.random.RandomState or None
        Seed / generator for the shuffles. ``None`` (the default) uses
        NumPy's global RNG, exactly as before this parameter existed.

    Returns
    -------
    list
        ``[X_part_0, ..., X_part_k, y_part_0, ..., y_part_k]``.
    """
    y = np.asarray(y)
    # One generator for all classes, so a single seed fixes the whole split
    # while each class still gets its own permutation.
    rng = sklearn.utils.check_random_state(random_state)
    proportions = np.array(proportions)
    n_parts = len(proportions)
    xparts = [[] for _ in range(n_parts)]
    yparts = [[] for _ in range(n_parts)]
    for label in np.unique(y):
        cidx = sklearn.utils.shuffle(np.where(y == label)[0], random_state=rng)
        sizes = (cidx.size * proportions).astype(int)
        sizes[-1] = cidx.size - sizes[:-1].sum()  # remainder goes to the last part
        bounds = np.concatenate(([0], np.cumsum(sizes)))
        for k in range(n_parts):
            part_idx = cidx[bounds[k]:bounds[k + 1]]
            xparts[k].append(X.iloc[part_idx, :])
            yparts[k].append(y[part_idx])
    xparts = [pd.concat(xpart) for xpart in xparts]
    yparts = [np.concatenate(ypart) for ypart in yparts]
    return xparts + yparts
def cat_features_indices(X):
    """Return the positions of the categorical columns of ``X``.

    A column is categorical when its dtype is ``object`` or its name ends
    with ``'code'``.
    """
    flags = []
    for name in X.columns.values:
        looks_categorical = (X[name].dtype == object) or name.endswith('code')
        flags.append(looks_categorical)
    return np.flatnonzero(np.array(flags))

def cat_features_names(X):
    """Return the names of the columns picked by ``cat_features_indices``."""
    positions = cat_features_indices(X)
    return X.columns[positions]

def copy_and_reset(X, cols, cat_unknown='unknown', num_unknown=0):
    """Return a copy of ``X`` in which the columns ``cols`` are blanked out.

    A column is filled with ``cat_unknown`` when it has ``object`` dtype, its
    name ends with ``'code'``, or its name contains ``'date'`` without
    ``'_fq'``; every other column is filled with ``num_unknown``. The input
    frame is left untouched.
    """
    X = X.copy()
    for c in cols:
        is_categorical = ((X[c].dtype == object)
                          or (c.endswith('code'))
                          or ('date' in c and '_fq' not in c))
        if is_categorical:
            # Replace the column outright: writing a string into an integer
            # `*code` column through `.loc` is an incompatible-dtype setitem
            # (deprecated in recent pandas).
            X[c] = cat_unknown
        else:
            X.loc[:, c] = num_unknown
    return X

def spawn_noisy(X, y, colsets_to_noise):
    """Stack one blanked-out copy of ``X`` per entry of ``colsets_to_noise``.

    The ``id`` column is dropped if present. For every column set, a copy of
    the frame with those columns reset (see ``copy_and_reset``) is produced;
    the copies are concatenated with a fresh index and ``y`` is repeated once
    per copy so it stays aligned with the stacked rows.
    """
    base = X.drop('id', axis=1, errors='ignore').copy()
    noisy_frames = []
    for cols in colsets_to_noise:
        noisy_frames.append(copy_and_reset(base, cols).reset_index(drop=True))
    X = pd.concat(noisy_frames, ignore_index=True)
    y_repeats = [y for _ in colsets_to_noise]
    y = np.concatenate(y_repeats)
    return X, y

In [4]:
# Read the three CSV inputs from the working directory.
X_train = pd.read_csv('X_train.csv')
y_train = pd.read_csv('y_train.csv')
X_test = pd.read_csv('X_test.csv')
# The table returned here is neither stored nor displayed (it is not the last
# expression of the cell); only the shapes below are shown.
show_intersections(X_train, X_test)
display(X_train.shape, y_train.shape, X_test.shape)

(59400, 40)

(59400, 2)

(14850, 40)

In [6]:
# NOTE(review): in the visible notebook `xtr` is first assigned in a later cell,
# so this cell fails on a fresh top-to-bottom run — confirm and move it if so.
xtr.columns

Index(['id', 'amount_tsh', 'date_recorded', 'funder', 'gps_height',
       'installer', 'longitude', 'latitude', 'wpt_name', 'num_private',
       'basin', 'subvillage', 'region', 'region_code', 'district_code', 'lga',
       'ward', 'population', 'public_meeting', 'recorded_by',
       'scheme_management', 'scheme_name', 'permit', 'construction_year',
       'extraction_type', 'extraction_type_group', 'extraction_type_class',
       'management', 'management_group', 'payment', 'payment_type',
       'water_quality', 'quality_group', 'quantity', 'quantity_group',
       'source', 'source_type', 'source_class', 'waterpoint_type',
       'waterpoint_type_group'],
      dtype='object')

In [22]:
class PrefitedPipeline:
    """Run a list of ``(name, step)`` transformers one after another.

    ``fit`` does nothing and returns the pipeline itself; ``transform`` feeds
    the output of each step's ``transform`` into the next one.

    ``verbose`` controls logging: above 0 each step name is printed, above 2
    the input and per-step output columns are printed as well.
    """

    def __init__(self, steps, verbose=1):
        self.steps = steps
        self.verbose = verbose

    def fit(self, X, y=None):
        # Intentionally a no-op: the steps are not fitted here.
        return self

    def transform(self, X):
        """Apply every step's ``transform`` in order and return the result."""
        announce_steps = self.verbose > 0
        dump_columns = self.verbose > 2
        if dump_columns:
            print('Input columns: %s' % X.columns)
        current = X
        for stepname, step in self.steps:
            if announce_steps:
                print('Executing %s' % stepname)
            current = step.transform(current)
            if not dump_columns:
                continue
            if isinstance(current, pd.DataFrame):
                described = current.columns
            else:
                described = type(current)
            print('Step %s output columns: %s' % (stepname, described))
        return current

In [7]:
# Encode the target labels (second column of y_train) as integers.
y_enc = sklearn.preprocessing.LabelEncoder()
y_tr = y_enc.fit_transform(y_train.iloc[:,1])

# Column sets that are blanked out to create noisy copies of the data.
# Defined *before* first use so the cell runs on a fresh kernel. Some sets
# appear more than once; that is kept as-is because each entry yields one
# extra copy of the data.
cols_to_reset = [
    [],
    ['wpt_name'],
    ['wpt_name', 'installer', 'funder'],
    ['wpt_name', 'subvillage'],
    ['wpt_name', 'subvillage'],
    ['wpt_name', 'subvillage', 'installer', 'funder' ],
    ['wpt_name', 'subvillage', 'installer', 'funder', 'scheme_name',
     'ward', 'region_code', ],
    ['wpt_name', 'subvillage', 'installer', 'funder', 'scheme_management',
     'ward', 'region_code', 'date_recorded'],
    ['funder', 'gps_height',
       'installer', 'longitude', 'latitude', 'wpt_name', 'num_private',
       'basin', 'subvillage', 'region', 'region_code', 'district_code', 'lga',
       'ward', 'public_meeting', 'recorded_by', 'scheme_name', 'payment', 'payment_type',],
    ['amount_tsh', 'funder', 'installer', 'date_recorded', 'construction_year',
     'longitude', 'latitude', 'wpt_name', 'num_private',
       'subvillage', 'region_code', 'district_code',
       'ward', 'population', 'public_meeting', 'recorded_by',
       'scheme_name', 'extraction_type', 'management', 'management_group',
     'payment', 'payment_type',],
    ['funder', 'installer', 'wpt_name', 'num_private',
       'subvillage', 'region_code', 'district_code', 
       'public_meeting', 'recorded_by',
      'payment', 'payment_type',],
    ['funder', 'gps_height',
       'installer', 'longitude', 'latitude', 'wpt_name', 'num_private',
       'basin', 'subvillage', 'region',
       'ward', 'public_meeting', 'recorded_by',
     'scheme_name', 'payment', 'payment_type',],
    ['funder', 'installer', 'wpt_name', 'num_private',
       'subvillage', 'region_code', 'district_code', 
       'public_meeting', 'recorded_by',
      'payment', 'payment_type',],
    ['funder', 'installer', 'wpt_name', 'num_private',
       'subvillage', 'region_code', 'district_code', 
       'public_meeting', 'recorded_by',
      'payment', 'payment_type',],
]

# Stratified 70/15/15 split, then augment the training part with noisy copies.
xtr, xcv, xva, ytr, ycv, yva = split_train_cv_test(X_train, y_tr, proportions=(.7, .15, .15))
xtr, ytr = spawn_noisy(xtr, ytr, cols_to_reset)

# Step 1, preprocessing
prepr0 = PrefitedPipeline([
    ('qty', DecodeQuantity('quantity')),
    ('date_recorded', SplitDate('date_recorded', drop=True)),
    ('age', ExplicitAge()),
    ('fillna', HideMissingValues()),
    ('handyfeats', HandyFeatures()),
    ('fillna2', HideMissingValues()),
    ('labelenc', LabelencodeAll(cols=cat_features_names(X_train))),
] + [
    (catcol + '_fq', OutcomeFrequences([catcol], drop=False))
    for catcol in ['installer', 'subvillage', 'region', 'basin','date_recorded_month',
                   'extraction_type_class',]
] + [
    ('drop', DropGarbage(cols=[
        'id', 'recorded_by', 'num_private', 'date_recorded_year', 'date_recorded_day'
    ]))
])

# PrefitedPipeline.fit() is a no-op, so the steps are fitted here explicitly:
# each step is fitted on the output of the steps before it and then applied,
# which leaves `prepr0` ready for transform() on the other splits.
# NOTE(review): this replaces the former `prepr0.fit(xtr.append(), ytr)`, which
# ran before `prepr0` existed and called `append()` without an argument —
# confirm that fitting on the augmented training split is what was intended.
xtr_prepared = xtr
for _stepname, _step in prepr0.steps:
    xtr_prepared = _step.fit(xtr_prepared, ytr).transform(xtr_prepared)
xtr = xtr_prepared

In [8]:
# NOTE(review): this shows the index of `xtr` next to the size of `y_tr` (the
# labels before splitting); presumably `ytr.size` was meant — confirm.
display('sizes: ', xtr.index, y_tr.size)

'sizes: '

RangeIndex(start=0, stop=582092, step=1)

59400

In [9]:
# Keep the column index and the id column of the raw training frame...
X_train_columns = X_train.columns
X_train_ids = X_train['id']
# ...then release the raw frames. Any cell that still needs `X_train` or
# `y_train` has to run before this one.
del X_train
del y_train

In [21]:
# Replace categorical values that are missing or absent from `xtr` by a
# placeholder, then augment and preprocess the CV split like the training one.
# NOTE(review): `xtr` has already been passed through `prepr0` at this point;
# confirm its values are still comparable with the raw values of `xcv`.
# NOTE(review): the cell reassigns `xcv`/`ycv`, so running it twice augments
# the already augmented data again — re-create the split before re-running.
for c in cat_features_names(xcv):
    if c not in xtr.columns: continue
    col = xcv[c]
    # Choose the placeholder per column (not per value) so that missing
    # entries of a text column become 'unknown' rather than 0. Testing with
    # notnull() instead of np.isnan() also works for string values.
    filler = 0 if pd.api.types.is_numeric_dtype(col) else 'unknown'
    keep = col.notnull() & col.isin(xtr[c].unique())
    xcv[c] = col.where(keep, filler)
xcv, ycv = spawn_noisy(xcv, ycv, cols_to_reset)
xcv = prepr0.transform(xcv)

c: funder
c: installer
c: wpt_name
c: basin
c: subvillage
c: region
c: region_code
c: district_code
c: lga
c: ward
c: public_meeting
c: scheme_management
c: scheme_name
c: permit
c: extraction_type
c: extraction_type_group
c: extraction_type_class
c: management
c: management_group
c: payment
c: payment_type
c: water_quality
c: quality_group
c: quantity
c: quantity_group
c: source
c: source_type
c: source_class
c: waterpoint_type
c: waterpoint_type_group


MemoryError: 

In [None]:
# Replace categorical values that are missing or absent from `xtr` by a
# placeholder, then augment and preprocess the validation split.
# NOTE(review): `xtr` has already been passed through `prepr0` at this point;
# confirm its values are still comparable with the raw values of `xva`.
for c in cat_features_names(xva):
    if c not in xtr.columns: continue
    col = xva[c]
    # The placeholder is chosen from the column dtype: calling
    # np.issubdtype() on a *value* (e.g. a string) raises instead of
    # answering whether the value is numeric.
    filler = 0 if pd.api.types.is_numeric_dtype(col) else 'unknown'
    keep = col.notnull() & col.isin(xtr[c].unique())
    xva[c] = col.where(keep, filler)
xva, yva = spawn_noisy(xva, yva, cols_to_reset)
xva = prepr0.transform(xva)

In [None]:
# Positions (within `xtr`) of the columns handed to CatBoost as categorical.
cat_feats = list(map(xtr.columns.get_loc, [
    'funder', 'installer', 'wpt_name', 'basin', 'subvillage',
    'region', 'region_code', 'district_code', 'lga', 'ward',
    'extraction_type',
    'management', 'management_group', 'payment', 'payment_type', 'water_quality',
    'quality_group', 'source', 'source_type', 'source_class',
    'waterpoint_type', 'waterpoint_type_group',
]))

In [None]:
# Multi-class CatBoost model, 100 boosting iterations, reporting accuracy.
clf = catboost.CatBoostClassifier(iterations=100,
                                  loss_function='MultiClass',
                                  eval_metric='Accuracy',
                                  calc_feature_importances=True)
# Fit on the training split; the CV split is passed as the evaluation set and
# `cat_feats` holds the positions of the categorical columns.
clf.fit(xtr, ytr, eval_set=(xcv, ycv),
        cat_features=cat_feats,
        verbose=True)

In [None]:
# Score of the fitted model on the training, CV and validation splits, in that order.
display(clf.score(xtr, ytr),
        clf.score(xcv, ycv),
        clf.score(xva, yva))

In [None]:
# Score only the un-noised part of the CV split. spawn_noisy() stacks one copy
# of the split per entry of `cols_to_reset`, and the first entry is the empty
# column set, so the leading len(xcv) / len(cols_to_reset) rows are the
# original CV rows. (59400 is the size of the whole training file, not of the
# CV split, and would mix several noisy copies into the score.)
orig_len = xcv.shape[0] // len(cols_to_reset)
display(clf.score(xcv.iloc[:orig_len,:], ycv[:orig_len]),)

In [None]:
# Prepare the test set like the other splits (without noisy copies) and predict.
# NOTE(review): `xtr` has already been passed through `prepr0` at this point;
# confirm its values are still comparable with the raw values of `X_te`.
X_te = X_test.copy()
for c in cat_features_names(X_te):
    if c not in xtr.columns: continue
    col = X_te[c]
    # The placeholder is chosen from the column dtype: calling
    # np.issubdtype() on a *value* (e.g. a string) raises instead of
    # answering whether the value is numeric.
    filler = 0 if pd.api.types.is_numeric_dtype(col) else 'unknown'
    keep = col.notnull() & col.isin(xtr[c].unique())
    X_te[c] = col.where(keep, filler)
# X_te.loc[:,'date_recorded'] = 'Unknown'
X_te = prepr0.transform(X_te.drop('id', axis=1))
# Flatten the predictions before decoding: inverse_transform expects a 1-D
# array of encoded labels.
y_te = y_enc.inverse_transform(np.ravel(clf.predict(X_te)).astype(int))

In [None]:
# Show the decoded predictions for the test set.
y_te

In [None]:
# Build the answer table: one `status_group` prediction per test `id`
# (the ids become the index and are written as the first CSV column).
ans = pd.DataFrame({'status_group': y_te.ravel()}, index=X_test['id'])
ans.index.name = 'id'
ans.to_csv('ans.csv')

In [None]:
# Peek at the first lines of the written file (shell command).
!head ans.csv