In [1]:
import pandas as pd, numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import bokeh.plotting
import keras, keras.models, keras.layers
import sklearn.preprocessing, sklearn.feature_selection, sklearn.model_selection
from sklearn.pipeline import Pipeline, make_pipeline
import sklearn.base
from sklearn.base import BaseEstimator, TransformerMixin
import seaborn as sns
import dateutil.parser
import collections
import sklearn.utils
import itertools
import re

import jupyternotify
# Register the jupyternotify magics on the running IPython shell so they can be
# used in later cells.
ip = get_ipython()
ip.register_magics(jupyternotify.JupyterNotifyMagics)

# Seed NumPy's global random generator so that shuffles and splits are
# repeatable.  `np.random.seed` has to be *called*: assigning to it would only
# replace the function with an integer and leave the generator unseeded.
SEED = 42
np.random.seed(SEED)

%matplotlib inline
%load_ext Cython

Using TensorFlow backend.


<IPython.core.display.Javascript object>

In [2]:
%%cython
cimport numpy as np

cpdef float qtof_(x):
    """Translate a quantity label into a numeric score.

    Recognised labels and their scores: 'enough' -> 1.0, 'insufficient' -> 0.6,
    'seasonal' -> 0.4, 'dry' -> 0.2, 'unknown' -> 0.  Any other value gives 0.
    """
    score_by_label = {
        'unknown': 0,
        'dry': .2,
        'seasonal': .4,
        'insufficient': .6,
        'enough': 1.0,
    }
    # A single lookup with a fallback replaces the membership test + indexing.
    return score_by_label.get(x, 0)

In [3]:
class LabelencodeAll:
    """Label-encode a chosen set of non-numeric DataFrame columns.

    One ``sklearn.preprocessing.LabelEncoder`` is fitted per requested column
    that is present in the frame and whose dtype is not numeric.

    Parameters
    ----------
    cols : iterable of str
        Names of the columns to encode.  Names that are missing from the frame
        passed to ``fit``/``transform`` are skipped silently.

    Attributes
    ----------
    encs_ : collections.defaultdict
        Maps a column name to the ``LabelEncoder`` fitted on it.
    """
    def __init__(self, cols):
        self.cols = cols
        self.encs_ = collections.defaultdict(sklearn.preprocessing.LabelEncoder)

    def fit(self, X, y=None):
        """Fit one encoder per non-numeric column of ``cols`` found in ``X``."""
        # Start from a clean slate so that refitting on other data/columns does
        # not leave encoders from a previous fit behind.
        self.encs_ = collections.defaultdict(sklearn.preprocessing.LabelEncoder)
        for c in self.cols:
            if c not in X.columns:
                continue
            if not np.issubdtype(X[c].dtype, np.number):
                self.encs_[c].fit(X[c])
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the fitted columns replaced by their codes."""
        X = X.copy()
        for c, enc in self.encs_.items():
            if c not in X.columns:
                continue
            # A column that is already numeric at transform time is left as is.
            if not np.issubdtype(X[c].dtype, np.number):
                # Plain column assignment replaces the column, so it takes the
                # integer dtype of the codes instead of staying `object`.
                X[c] = enc.transform(X[c])
        return X

    def fit_transform(self, X, y=None):
        """Fit on ``X`` and return the encoded copy (same as fit + transform)."""
        return self.fit(X, y).transform(X)

class DecodeQuantity(BaseEstimator, TransformerMixin):
    """Stateless transformer that maps one column through ``qtof_``.

    Parameters
    ----------
    col : str
        Name of the column whose values are replaced by ``qtof_(value)``.
    """
    def __init__(self, col):
        self.col = col

    def fit(self, X, y=None):
        # Nothing is learned from the data.
        return self

    def transform(self, X):
        """Return a copy of ``X`` with ``col`` replaced by its decoded values."""
        result = X.copy()
        decoded = result.loc[:, self.col].map(qtof_)
        result.loc[:, self.col] = decoded
        return result
    
    
class HideMissingValues(BaseEstimator, TransformerMixin):
    """Replace missing values column by column.

    * numeric columns: NaN -> median of the column (0 when the median is NaN);
    * columns holding only True / False / NaN: True -> 1.0, False -> 0.0,
      NaN -> 0.5;
    * all other columns: NaN -> 'unknown', then the column is cast to ``str``.

    The transformer keeps no state: the median is computed on whatever frame
    is passed to ``transform``.
    """
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        """Return a copy of ``X`` with missing values filled as described above."""
        X = X.copy()
        for c in X.columns:
            col = X[c]
            if np.issubdtype(col.dtype, np.number):
                med = col.median()
                if np.isnan(med):
                    med = 0
                # Assign the filled column back: an in-place fillna on a
                # `.loc` selection may act on a temporary and be lost.
                X[c] = col.fillna(med)
            elif set(col.unique()).issubset({True, False, np.nan}):
                # Remember where the gaps are *before* recoding; comparing NaN
                # with 1 is False, so recoding first would turn NaN into 0 and
                # the 0.5 marker would never be written.
                missing = col.isnull()
                flags = (col == 1).astype(float)
                flags[missing] = 0.5
                X[c] = flags
            else:
                X[c] = col.fillna('unknown').astype('str')
        return X
    

class DropGarbage(BaseEstimator, TransformerMixin):
    """Stateless transformer that removes a list of columns.

    Parameters
    ----------
    cols : list of str
        Columns to remove; names absent from the frame are ignored.
    """
    def __init__(self, cols=['id', 'funder', 'recorded_by', 'date_recorded']):
        self.cols = cols

    def fit(self, X, y=None):
        # Nothing is learned from the data.
        return self

    def transform(self, X):
        """Return ``X`` without the configured columns (input is not modified)."""
        pruned = X.drop(columns=self.cols, errors='ignore')
        return pruned

class SplitDate(BaseEstimator, TransformerMixin):
    """Split a date column into year / month / day / weekday columns.

    Parameters
    ----------
    col : str
        Name of the column holding dates, either as strings (parsed with
        ``dateutil.parser.parse``) or as objects exposing ``year``, ``month``,
        ``day`` and ``weekday()``.
    drop : bool, default True
        Whether to remove the original column from the result.

    The new columns are named ``<col>_year``, ``<col>_month``, ``<col>_day``
    and ``<col>_weekday``.  Missing or unparseable values give NaN in all four.
    """
    def __init__(self, col, drop=True):
        self.col = col
        self.drop = drop

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the four date-part columns added."""
        missing = (np.nan, np.nan, np.nan, np.nan)

        def t(x):
            if isinstance(x, str):
                try:
                    x = dateutil.parser.parse(x)
                except (ValueError, OverflowError):
                    # Only parsing failures mean "no date"; anything else
                    # (e.g. KeyboardInterrupt) must propagate.
                    return missing
            elif pd.isnull(x):
                return missing
            return (x.year, x.month, x.day, x.weekday())

        X = X.copy()
        names = [self.col + suffix
                 for suffix in ('_year', '_month', '_day', '_weekday')]
        # Building a frame from the list of 4-tuples also works for an empty
        # input, where unpacking `zip(*...)` into four targets would fail.
        parts = pd.DataFrame(X[self.col].map(t).tolist(), columns=names)
        for name in names:
            # `.values` assigns by position, independent of X's index.
            X[name] = parts[name].values
        if self.drop:
            X = X.drop(self.col, axis=1)
        return X

    
class HandyFeatures(BaseEstimator, TransformerMixin):
    """Add the derived feature ``amount_per_man = amount_tsh / population``.

    Rows where the ratio is undefined (0/0 -> NaN) or unbounded
    (x/0 -> +/-inf) get 0, so the new column is always finite.
    """
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the ``amount_per_man`` column added."""
        X = X.copy()
        ratio = X['amount_tsh'] / X['population']
        # fillna alone would leave the infinities produced by a zero
        # population in place; map them to NaN first.
        X['amount_per_man'] = ratio.replace([np.inf, -np.inf], np.nan).fillna(0)
        return X

class OutcomeFrequences(BaseEstimator, TransformerMixin):
    """Encode grouping columns by the outcome frequencies seen during ``fit``.

    ``fit`` counts, for every distinct combination of values in the
    ``groupby`` columns, how often each outcome of ``y`` occurs and normalises
    the counts to relative frequencies.  ``transform`` adds one column per
    outcome, named ``<groupby...>_<outcome>_fq``; rows whose key was not seen
    during ``fit`` get 0 in all of them.

    Parameters
    ----------
    groupby : list of str
        Columns whose joint value forms the group key.
    drop : bool, default False
        Remove the ``groupby`` columns from the transformed frame.
    return_only_fq : bool, default False
        Return only the frequency columns instead of ``X`` plus those columns.

    Attributes
    ----------
    cnts_ : collections.defaultdict
        Group key (tuple) -> array of outcome frequencies, ordered like
        ``possible_outcomes_``.
    possible_outcomes_ : list
        Outcomes found in ``y`` (for string labels: their last word).
    fq_cols_ : list of str
        Names of the generated columns.
    unknowns_ : float
        Share of rows in the last ``transform`` call whose key was unseen.

    Notes
    -----
    ``y`` is expected to be label-encoded.
    """
    def __init__(self, groupby, drop=False, return_only_fq=False):
        self.groupby = groupby
        self.drop = drop
        self.cnts_ = None
        self.fq_cols_ = None
        self.return_only_fq = return_only_fq
        self.unknowns_ = 0

    def _row_keys(self, X):
        """Iterate over the group key (tuple of ``groupby`` values) of each row."""
        return X.loc[:, list(self.groupby)].itertuples(index=False, name=None)

    def fit(self, X, y=None):
        """Learn the per-group outcome frequencies from ``X`` and ``y``."""
        if y is None:
            raise ValueError('OutcomeFrequences.fit needs the outcomes `y`')
        y = np.asarray(y)
        outcomes = np.unique(y)
        possible_outcomes = [_.split()[-1] if isinstance(_, str) else _ for _ in outcomes]
        n_outs = len(possible_outcomes)
        # Position of every outcome in the frequency vector; this also copes
        # with label sets that do not run contiguously from 0.
        position = {out: i for i, out in enumerate(outcomes.tolist())}

        self.cnts_ = collections.defaultdict(lambda: np.zeros(n_outs))
        for keys, out in zip(self._row_keys(X), y.tolist()):
            self.cnts_[keys][position[out]] += 1  # no. of `(keys, out)` occurrences
        for k in self.cnts_:
            # True division: floor division would collapse every frequency
            # below 1 to 0.
            self.cnts_[k] /= self.cnts_[k].sum()

        self.possible_outcomes_ = possible_outcomes
        self.fq_cols_ = ['_'.join(list(self.groupby) + [str(out), 'fq'])
                         for out in possible_outcomes]
        return self

    def transform(self, X):
        """Return ``X`` with the frequency columns added (or only those columns)."""
        n_rows = X.shape[0]
        freqs = np.zeros((n_rows, len(self.possible_outcomes_)))
        n_unknown = 0
        for i, keys in enumerate(self._row_keys(X)):
            # .get() does not trigger the defaultdict factory, so unseen keys
            # are not inserted into cnts_.
            known = self.cnts_.get(keys)
            if known is None:
                n_unknown += 1  # row keeps its all-zero frequencies
            else:
                freqs[i] = known
        # Share of rows (not of cells) with an unseen key.
        self.unknowns_ = n_unknown / n_rows if n_rows else 0
        # Reuse X's index so that the column-wise concat lines rows up even
        # when X does not carry a default RangeIndex.
        new_cols = pd.DataFrame(freqs, columns=self.fq_cols_, index=X.index)
        if self.return_only_fq:
            return new_cols
        X = pd.concat((X, new_cols), axis=1)
        if self.drop:
            X = X.drop(self.groupby, axis=1, errors='ignore')
        return X

class ExplicitAge(BaseEstimator, TransformerMixin):
    """Stateless transformer adding ``age = date_recorded_year - construction_year``."""
    def fit(self, X, y=None):
        # Nothing is learned from the data.
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the ``age`` column added."""
        augmented = X.copy()
        recorded = augmented.loc[:, 'date_recorded_year']
        built = augmented.loc[:, 'construction_year']
        augmented.loc[:, 'age'] = recorded - built
        return augmented

def show_intersections(A, B, add_cat_feats=['date_recorded', 'construction_year',
                                            'date_recorded_year', 'date_recorded_month']):
    """Measure how much the categorical value sets of two frames overlap.

    A column of ``A`` counts as categorical when its dtype is ``object``, its
    name ends with ``'code'`` or it is listed in ``add_cat_feats``.  Columns
    that are missing from ``B`` are skipped.

    Parameters
    ----------
    A, B : pandas.DataFrame
        Frames to compare (e.g. train and test features).
    add_cat_feats : list of str
        Extra column names to treat as categorical.

    Returns
    -------
    pandas.DataFrame
        One row per categorical column with ``var`` (column name),
        ``intersection/union`` (|A∩B| / |A∪B|) and ``intersection/B``
        (|A∩B| / |B|), sorted by ``intersection/B`` ascending.  A ratio with
        an empty denominator is NaN.
    """
    cat_feats_observed = []
    # Categorical columns are detected on A itself (not on a notebook global)
    # and matched to B by name rather than by position.
    for c in A.columns:
        is_cat = ((A[c].dtype == object)
                  or str(c).endswith('code')
                  or (c in add_cat_feats))
        if not is_cat or c not in B.columns:
            continue
        a_vals, b_vals = set(A[c]), set(B[c])
        n_int = len(a_vals & b_vals)
        n_uni = len(a_vals | b_vals)
        n_b = len(b_vals)
        cat_feats_observed.append((
            c,
            n_int / n_uni if n_uni else np.nan,
            n_int / n_b if n_b else np.nan,
        ))
    cat_feats_observed = (
        pd.DataFrame(cat_feats_observed, columns=['var', 'intersection/union', 'intersection/B'])
        .sort_values('intersection/B')
    )
    return cat_feats_observed

        
def split_train_cv_test(X, y, proportions=(.75, .25/2, .25/2), random_state=None):
    """Split ``X``/``y`` into several parts, class by class (stratified).

    Every class of ``y`` is shuffled and cut according to ``proportions``;
    the last part receives whatever remains after rounding down the others.

    Parameters
    ----------
    X : pandas.DataFrame
        Features, indexed positionally in step with ``y``.
    y : array-like
        Class label of every row of ``X``.
    proportions : sequence of float
        Relative size of each part.
    random_state : int, numpy.random.RandomState or None, default None
        Source of randomness for the shuffling.  ``None`` uses NumPy's global
        generator, which is the behaviour of a call without this argument.

    Returns
    -------
    list
        ``[X_part_0, ..., X_part_k, y_part_0, ..., y_part_k]``.  Inside each
        part the rows stay grouped by class.
    """
    y = np.asarray(y)
    # One generator for all classes, so that a fixed seed fixes the whole split.
    rng = sklearn.utils.check_random_state(random_state)
    proportions = np.array(proportions)
    xparts = [[] for _ in proportions]
    yparts = [[] for _ in proportions]
    for label in np.unique(y):
        cidx = sklearn.utils.shuffle(np.where(y == label)[0], random_state=rng)
        sizes = (cidx.size * proportions).astype(int)
        sizes[-1] = cidx.size - sizes[:-1].sum()  # remainder goes to the last part
        bounds = np.concatenate(([0], np.cumsum(sizes)))
        for xpart, ypart, lo, hi in zip(xparts, yparts, bounds[:-1], bounds[1:]):
            chunk = cidx[lo:hi]
            xpart.append(X.iloc[chunk, :])
            ypart.append(y[chunk])
    xparts = [pd.concat(xpart) for xpart in xparts]
    yparts = [np.concatenate(ypart) for ypart in yparts]
    return xparts + yparts
def cat_features_indices(X):
    """Return the positions of the columns of ``X`` treated as categorical.

    A column qualifies when its dtype is ``object`` or its name ends with
    ``'code'``.
    """
    flags = []
    for name in X.columns.values:
        flags.append((X[name].dtype == object) or (name.endswith('code')))
    return np.where(np.array(flags))[0]

def cat_features_names(X):
    """Return the names of the columns selected by ``cat_features_indices``."""
    positions = cat_features_indices(X)
    return X.columns[positions]

def copy_and_reset(X, cols, save=False, cat_unknown='unknown', num_unknown=0):
    """Return a copy of ``X`` in which selected columns are blanked out.

    With ``save`` false the columns listed in ``cols`` are blanked; with
    ``save`` true every column *not* listed in ``cols`` is blanked.  Blanking
    writes ``cat_unknown`` into object-dtype columns and ``num_unknown`` into
    all others.  ``X`` itself is left untouched.
    """
    result = X.copy()
    for name in result.columns:
        listed = name in cols
        # Listed columns survive in save-mode, unlisted ones in reset-mode.
        untouched = listed if save else not listed
        if untouched:
            continue
        filler = cat_unknown if result[name].dtype == object else num_unknown
        result.loc[:, name] = filler
    return result

def spawn_noisy(X, y, colsets_to_reset=[], colsets_to_save=None):
    """Stack several partially blanked copies of ``X`` on top of each other.

    One copy is produced per column set via ``copy_and_reset``.  When
    ``colsets_to_save`` is given, each set names the columns to keep (all
    others are blanked) and ``colsets_to_reset`` is ignored; otherwise each
    set of ``colsets_to_reset`` names the columns to blank.  An ``id`` column,
    if present, is dropped first.

    Returns
    -------
    (pandas.DataFrame, numpy.ndarray)
        The stacked copies with a fresh 0..n-1 index, and ``y`` repeated once
        per column set.
    """
    keep_mode = colsets_to_save is not None
    colsets = colsets_to_save if keep_mode else colsets_to_reset
    base = X.drop('id', axis=1, errors='ignore').copy()
    variants = [
        copy_and_reset(base, cols, save=keep_mode).reset_index(drop=True)
        for cols in colsets
    ]
    stacked = pd.concat(variants, ignore_index=True)
    labels = np.concatenate([y for _ in colsets])
    return stacked.reset_index(drop=True), labels
# Load the training features, training labels and test features from CSV
# files in the working directory.
X_train = pd.read_csv('X_train.csv')
y_train = pd.read_csv('y_train.csv')
X_test = pd.read_csv('X_test.csv')
# Show how much the categorical values of train and test overlap, then the
# shapes of the three tables.
display(show_intersections(X_train, X_test))
display(X_train.shape, y_train.shape, X_test.shape)
class PrefitedPipeline:
    """Chain already-fitted transformers; only their ``transform`` is called.

    Parameters
    ----------
    steps : iterable of (name, transformer) pairs
        Applied in order; every transformer must provide ``transform(X)``.
    verbose : int, default 1
        Above 0 the name of each step is printed before it runs; above 2 the
        input columns and each step's output columns (or output type, when the
        output is not a DataFrame) are printed as well.
    """
    def __init__(self, steps, verbose=1):
        self.steps = steps
        self.verbose = verbose

    def fit(self, X, y=None):
        # The steps are expected to be fitted already.
        return self

    def transform(self, X):
        """Push ``X`` through every step and return the final result."""
        level = self.verbose
        show_columns = level > 2
        if show_columns:
            print('Input columns: %s' % X.columns)
        for stepname, step in self.steps:
            if level > 0:
                print('Executing %s' % stepname)
            X = step.transform(X)
            if show_columns:
                if isinstance(X, pd.DataFrame):
                    described = X.columns
                else:
                    described = type(X)
                print('Step %s output columns: %s' % (stepname, described))
        return X
    
# Encode the target (second column of y_train) as integer class ids.
y_enc = sklearn.preprocessing.LabelEncoder()
ytr = y_enc.fit_transform(y_train.iloc[:,1])

# Decode the textual 'quantity' column into a numeric score.  The test set is
# only transformed, never fitted on.
preprqty = DecodeQuantity('quantity')
xtr = preprqty.fit_transform(X_train)
xte = preprqty.transform(X_test)

# Remove columns that are not used as features; absent names are ignored.
unused_cols = ['id', 'num_private', 'recorded_by', 'wpt_name']
xtr = xtr.drop(unused_cols, axis=1, errors='ignore')
xte = xte.drop(unused_cols, axis=1, errors='ignore')

# Replace 'date_recorded' by its year / month / day / weekday parts ...
preprdate = SplitDate('date_recorded', drop=True)
xtr = preprdate.fit_transform(xtr)
xte = preprdate.transform(xte)
# ... and derive 'age' from the recording year and 'construction_year'.
preprage = ExplicitAge()
xtr = preprage.fit_transform(xtr)
xte = preprage.transform(xte)

# Fill the gaps of the categorical columns with the 'unknown' token and cast
# them to str, so that the label encoders fitted next see uniform strings.
# Assigning the bare constant to the whole selection would overwrite every
# value in these columns, not just the missing ones.
for frame in (xtr, xte):
    cat_cols = cat_features_names(frame)
    frame[cat_cols] = frame[cat_cols].fillna('unknown').astype(str)

# Label-encode the categorical columns.  The encoders are fitted on train and
# test stacked together, so every value occurring in either set gets a code.
xlabelenc = LabelencodeAll(cols=cat_features_names(xtr))
xlabelenc.fit(pd.concat((xtr, xte)))
xtr = xlabelenc.transform(xtr)
xte = xlabelenc.transform(xte)

# Fill whatever missing values remain (the transformer keeps no fitted state).
hmv_ = HideMissingValues()
xtr = hmv_.fit_transform(xtr)
xte = hmv_.transform(xte)

# Add the derived 'amount_per_man' feature.
hf_ = HandyFeatures()
xtr = hf_.fit_transform(xtr)
xte = hf_.transform(xte)

# Day and weekday of recording are not kept as features.
xtr.drop(['date_recorded_day', 'date_recorded_weekday'], axis=1, errors='ignore', inplace=True)
xte.drop(['date_recorded_day', 'date_recorded_weekday'], axis=1, errors='ignore', inplace=True)

# One outcome-frequency encoder per listed categorical column, stored as
# (name, transformer) pairs; the source column itself is kept (drop=False).
fqpreprs = [(catcol + '_fq', OutcomeFrequences([catcol], drop=False))
    for catcol in ['installer', 'subvillage', 'basin', 'extraction_type_class',
                   'date_recorded_month', 'region', 'ward', 'source_type']
]
# Fit each encoder on the training frame and labels, then apply it to the test
# frame.
# NOTE(review): the encoders are fitted on all of xtr *before* it is split
# into train / cv / validation below, so the held-out rows contribute their
# own labels to these features -- confirm this leakage is acceptable.
for i in range(len(fqpreprs)):
    print('fitting %s' % fqpreprs[i][0])
    xtr = fqpreprs[i][1].fit_transform(xtr, ytr)
    print('applying %s to test' % fqpreprs[i][0])
    xte = fqpreprs[i][1].transform(xte)


# Stratified 70 / 15 / 15 split into train, cross-validation and validation.
xtr, xcv, xva, ytr, ycv, yva = split_train_cv_test(xtr, ytr, proportions=(.7, .15, .15))


# Column sets for the "reset" augmentation: spawn_noisy makes one copy of the
# data per entry, with the listed columns blanked out.  The empty first entry
# yields an unmodified copy.
# NOTE(review): spawn_noisy silently ignores names that are not columns of the
# frame -- verify that every name below exists at this point.
cols_to_reset = [
    [],
    ['date_recorded_month'],
    ['subvillage'],
    [ 'installer', 'funder'],
    [ 'subvillage', 'installer', 'funder', 'date_recorded_month' ],
    [ 'subvillage', 'installer', 'funder',
     'scheme_name', 'ward', 'region_code', ],
    [ 'subvillage', 'installer', 'funder',
     'scheme_management', 'ward', 'region_code', 'date_recorded_month'],
]

# Column sets for the "save" augmentation: spawn_noisy makes one copy of the
# data per entry in which only the listed columns are kept and every other
# column is blanked out.
#
# The extended set is used five times on purpose (five copies of that variant).
# It is generated instead of pasted so that it is spelled once.  Names that do
# not match a column are ignored silently by copy_and_reset, which is why the
# former 'latitude' 'water_quality' (a missing comma, concatenated into
# 'latitudewater_quality') and the misspelt 'longtitude' went unnoticed while
# those three columns were being blanked.
cols_to_save = [
    ['age', 'amount_per_man', 'extraction_type',
     'water_quality', 'quality_group', 'source', 'source_type', 'source_class',
     'waterpoint_type', 'waterpoint_type_group'],
] + [
    ['date_recorded_month', 'age', 'amount_tsh', 'population',
     'amount_per_man', 'extraction_type', 'longitude', 'latitude',
     'water_quality', 'quality_group', 'source', 'source_type', 'source_class',
     'waterpoint_type', 'waterpoint_type_group']
    for _ in range(5)
]

# Positional indices, within xtr, of the columns listed below.  get_loc raises
# KeyError if a name is missing from xtr.
# NOTE(review): not used in the visible part of the notebook -- presumably
# consumed by a later model that needs categorical column indices; confirm.
cbc_cat_feats = [xtr.columns.get_loc(c) for c in [
    'funder', 'installer', 'basin', 'subvillage',
    'region', 'region_code', 'district_code', 'lga', 'ward',
    'extraction_type',
    'management', 'management_group', 'payment', 'payment_type', 'water_quality',
    'quality_group', 'source', 'source_type', 'source_class',
    'waterpoint_type', 'waterpoint_type_group'
]]

# Augment the training part: stack the "reset" variants (one per entry of
# cols_to_reset) and the "save" variants (one per entry of cols_to_save).
# Each spawn_noisy call returns an (X, y) pair; zip pairs the two frames and
# the two label arrays, frames are joined with pd.concat and label arrays with
# np.concatenate.
xtr, ytr = [pd.concat((a.reset_index(drop=True), b.reset_index(drop=True),),
                      ignore_index=True) if isinstance(a, pd.DataFrame)
            else np.concatenate((a,b)) for a,b in zip(
    spawn_noisy(xtr, ytr, cols_to_reset),
    spawn_noisy(xtr, ytr, colsets_to_save=cols_to_save))]

# Sanity check: the index of the stacked frame and the number of labels.
display('sizes: %s %s' % (xtr.index, ytr.size))

# The held-out parts get two copies with all columns kept (keeping every
# column blanks nothing) plus one copy per entry of cols_to_save.
xcv, ycv = spawn_noisy(xcv, ycv, colsets_to_save=[xcv.columns, xcv.columns] + cols_to_save)

xva, yva = spawn_noisy(xva, yva, colsets_to_save=[xva.columns, xva.columns] + cols_to_save)

Unnamed: 0,var,intersection/union,intersection/B
3,wpt_name,0.05595,0.068342
5,subvillage,0.294315,0.326939
2,installer,0.343011,0.385368
1,funder,0.344699,0.38883
14,scheme_name,0.56396,0.599926
0,date_recorded,0.861789,0.893258
13,scheme_management,0.923077,0.923077
10,ward,0.930887,0.933556
17,extraction_type,0.944444,0.944444
7,region_code,0.962963,0.962963


(59400, 40)

(59400, 2)

(14850, 40)

fitting installer_fq
applying installer_fq to test
fitting subvillage_fq
applying subvillage_fq to test
fitting basin_fq
applying basin_fq to test
fitting extraction_type_class_fq
applying extraction_type_class_fq to test
fitting date_recorded_month_fq
applying date_recorded_month_fq to test
fitting region_fq
applying region_fq to test
fitting ward_fq
applying ward_fq to test
fitting source_type_fq
applying source_type_fq to test


'sizes: RangeIndex(start=0, stop=540514, step=1) 540514'

In [None]:
# Binarise the integer class ids for the categorical cross-entropy loss.  The
# binarizer is fitted on the training labels and reused for the cv and
# validation labels.
# NOTE(review): this cell overwrites ytr, so running it a second time feeds the
# already binarised labels back in -- re-run from the split cell instead.
ybinarizer = sklearn.preprocessing.LabelBinarizer()
ytr = ybinarizer.fit_transform(ytr)
ycv = ybinarizer.transform(ycv)
yva = ybinarizer.transform(yva)


In [12]:

# Fully connected classifier: input-width layer -> 25 units -> one unit per
# target class, each followed by a softmax activation.
# NOTE(review): softmax is also used after the two hidden layers; a hidden
# non-linearity such as relu/tanh is the usual choice -- confirm this is
# intentional.
m = keras.models.Sequential([
    keras.layers.Dense(xtr.shape[1], input_shape=(xtr.shape[1],)),
    keras.layers.Activation('softmax'),
    keras.layers.Dense(25),
    keras.layers.Activation('softmax'),
    keras.layers.Dense(y_enc.classes_.size),
    keras.layers.Activation('softmax'),
])

# NOTE(review): the scaler is created but never applied (the call below is
# commented out), so the network is trained on unscaled features.
scaler = sklearn.preprocessing.StandardScaler()
# xtr = scaler.fit_transform(xtr)
m.compile(loss=keras.losses.categorical_crossentropy,
          optimizer='sgd',
          metrics=['categorical_accuracy'])

In [15]:
# Train for 20 epochs with mini-batches of 32 rows on the augmented training set.
# NOTE(review): xcv / ycv are prepared above but not passed as validation data
# here -- confirm whether they are evaluated elsewhere.
m.fit(xtr.values, ytr, epochs=20, batch_size=32)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7fb822e6d860>