# Learning from the experts

## Learning from the expert: processing

### Deciding what's a word

In [2]:
def multilabel_sample(y, size=1000, min_count=3, seed=None):
    """ Takes a matrix of binary labels `y` and returns the row indices of
        a random sample in which every label appears at least `min_count`
        times.

        Parameters
        ----------
        y : 2-D binary indicator matrix (numpy array or pandas DataFrame).
            For a DataFrame the returned values are index labels; otherwise
            they are row positions.
        size : sample size. Values > 1 are an absolute row count; values
            <= 1 are a fraction of len(y). If it is smaller than
            number of columns * `min_count`, a warning is issued and
            number of columns * `min_count` is used instead.
        min_count : minimum number of positive rows sampled for each label.
        seed : seed for the random generator; None draws a fresh,
            non-deterministic seed.

        Raises
        ------
        ValueError
            If `y` is not made of exactly the values 0 and 1, or if some
            label has fewer than `min_count` positive rows.
    """
    import warnings  # function-scope import: the file never imports `warn`

    # array_equal copes with any number of distinct values, whereas an
    # element-wise `!=` against [0, 1] depends on broadcasting succeeding.
    try:
        is_binary = np.array_equal(np.unique(y).astype(int), np.array([0, 1]))
    except (TypeError, ValueError):
        is_binary = False
    if not is_binary:
        raise ValueError('multilabel_sample only works with binary indicator matrices')

    if (y.sum(axis=0) < min_count).any():
        raise ValueError('Some classes do not have enough examples. Change min_count if necessary.')

    if size <= 1:
        size = int(np.floor(y.shape[0] * size))

    required = y.shape[1] * min_count
    if required > size:
        msg = "Size less than number of columns * min_count, returning {} items instead of {}."
        warnings.warn(msg.format(required, size))
        size = required

    # RandomState(None) seeds itself from fresh entropy. The previous
    # `np.random.randint(1)` always evaluated to 0, i.e. a fixed seed.
    rng = np.random.RandomState(seed)

    if isinstance(y, pd.DataFrame):
        choices = y.index
        y = y.values
    else:
        choices = np.arange(y.shape[0])

    # first, guarantee at least min_count of each label
    guaranteed = [np.array([], dtype=choices.dtype)]
    for j in range(y.shape[1]):
        label_choices = choices[y[:, j] == 1]
        guaranteed.append(rng.choice(label_choices, size=min_count, replace=False))

    # a row can be drawn for several labels; keep each index once
    sample_idxs = np.unique(np.concatenate(guaranteed))

    # now that we have at least min_count of each, we can just random sample
    sample_count = int(size - sample_idxs.shape[0])

    # get sample_count indices from remaining choices
    remaining_choices = np.setdiff1d(choices, sample_idxs)
    remaining_sampled = rng.choice(remaining_choices,
                                   size=sample_count,
                                   replace=False)

    return np.concatenate([sample_idxs, remaining_sampled])


def multilabel_sample_dataframe(df, labels, size, min_count=5, seed=None):
    """ Takes a dataframe `df` and returns a sample of size `size` where all
        classes in the binary matrix `labels` are represented at
        least `min_count` times.

        `labels` is expected to be row-aligned with `df`. `size`,
        `min_count` and `seed` are forwarded unchanged to
        `multilabel_sample`.
    """
    idxs = multilabel_sample(labels, size=size, min_count=min_count, seed=seed)

    # multilabel_sample returns index labels for a DataFrame but plain row
    # positions for anything else, so the lookup must match.
    if isinstance(labels, pd.DataFrame):
        return df.loc[idxs]
    return df.iloc[idxs]


def multilabel_train_test_split(X, Y, size, min_count=3, seed=None):
    """ Takes a features matrix `X` and a label matrix `Y` and
        returns (X_train, X_test, Y_train, Y_test).

        The test rows are chosen by `multilabel_sample`, so every class in
        `Y` is represented at least `min_count` times in the test split.
        `size` is the test-set size (absolute count if > 1, fraction of
        the rows if <= 1). The training split is every remaining row.
    """
    index = Y.index if isinstance(Y, pd.DataFrame) else np.arange(Y.shape[0])

    test_set_idxs = multilabel_sample(Y, size=size, min_count=min_count, seed=seed)

    # np.isin works for both a pandas Index and a plain ndarray; the
    # `.isin` method only exists on the former.
    test_set_mask = np.isin(index, test_set_idxs)
    train_set_mask = ~test_set_mask

    return (X[train_set_mask], X[test_set_mask], Y[train_set_mask], Y[test_set_mask])


import pickle
import pandas as pd
import numpy as np
# Load the pickled list of row labels used to subset the training data.
# NOTE(review): pickle.load can execute arbitrary code; only use it on an
# "index.list" file that comes from a trusted source.
with open("index.list", "rb") as f:
    index = pickle.load(f)

# Read the training data and keep only the rows named in `index`.
df = pd.read_csv("TrainingData.csv", index_col=0)
df = df.loc[index]

# Column groups: numeric features, target columns, and everything else.
NUMERIC_COLUMNS = ['FTE', 'Total']
LABELS = ['Function', 'Use', 'Sharing', 'Reporting', 'Student_Type', 'Position_Type', 'Object_Type', 'Pre_K', 'Operating_Status']
NON_LABELS = [c for c in df.columns if c not in LABELS]

# One-hot encode the target columns into a binary indicator matrix.
dummy_labels = pd.get_dummies(df[LABELS])


def combine_text_columns(data_frame, to_drop=NUMERIC_COLUMNS + LABELS):
    """ Join the remaining columns of each row into one space-separated
        string.

        Columns listed in `to_drop` are removed first (names that are not
        present in `data_frame` are ignored), missing values are replaced
        by empty strings, and the result is a Series with one string per
        row.
    """
    def _join_row(row):
        return " ".join(row)

    present = set(data_frame.columns.tolist())
    droppable = set(to_drop).intersection(present)

    text_only = data_frame.drop(droppable, axis=1)
    text_only.fillna("", inplace=True)
    return text_only.apply(_join_row, axis=1)

# Hold out 20% of the rows as a test set (size=0.2 is a fraction because it
# is <= 1); seed fixed for reproducibility.
X_train, X_test, y_train, y_test = multilabel_train_test_split(df[NON_LABELS],
                                                               dummy_labels,
                                                               0.2, 
                                                               seed=123)
from sklearn.feature_extraction.text import CountVectorizer
# One combined text string per training row.
text_vector = combine_text_columns(X_train)

# A token is a run of alphanumerics that is followed by whitespace; the
# lookahead means a token at the very end of a string (no trailing
# whitespace) is not matched.
TOKENS_ALPHANUMERIC = '[A-Za-z0-9]+(?=\\s+)'
text_features = CountVectorizer(token_pattern=TOKENS_ALPHANUMERIC)
text_features.fit(text_vector)
# NOTE(review): newer scikit-learn releases replace get_feature_names()
# with get_feature_names_out() - confirm against the installed version.
print(text_features.get_feature_names()[:10])

['00a', '12', '1st', '2nd', '4th', '5th', '70h', '8', 'a', 'aaps']


### N-gram range in scikit-learn

The dim_red step uses a scikit-learn function called SelectKBest(), applying something called the chi-squared test to select the K "best" features. The scale step uses a scikit-learn function called MaxAbsScaler() in order to squash the relevant features into the interval -1 to 1.



In [4]:
from sklearn.preprocessing import FunctionTransformer, MaxAbsScaler
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import chi2, SelectKBest

# Select 300 best features
chi_k = 300

# Selectors that split the incoming frame into its text and numeric parts.
get_text_data = FunctionTransformer(combine_text_columns, validate=False)
get_numeric_data = FunctionTransformer(lambda x: x[NUMERIC_COLUMNS], validate=False)
# A token is a run of alphanumerics that is followed by whitespace.
TOKENS_ALPHANUMERIC = '[A-Za-z0-9]+(?=\\s+)'

# Numeric branch: select + impute. Text branch: select + uni/bi-gram counts
# + chi-squared feature selection. The union is scaled and fed to a
# one-vs-rest logistic regression.
pl = Pipeline([
        ('union', FeatureUnion(
            transformer_list = [
                ('numeric_features', Pipeline([
                    ('selector', get_numeric_data),
                    ('imputer', SimpleImputer())
                ])),
                ('text_features', Pipeline([
                    ('selector', get_text_data),
                    ('vectorizer', CountVectorizer(token_pattern=TOKENS_ALPHANUMERIC,
                                                   ngram_range=(1, 2))),
                    # `k` must be passed by keyword: recent scikit-learn
                    # releases reject it as a positional argument.
                    ('dim_red', SelectKBest(chi2, k=chi_k))
                ]))
             ]
        )),
        ('scale', MaxAbsScaler()),
        ('clf', OneVsRestClassifier(LogisticRegression()))
    ])


In [5]:
# Fit the whole pipeline (feature extraction, scaling, classifier) on the
# training split.
pl.fit(X_train, y_train)

BOX_PLOTS_COLUMN_INDICES = [range(0, 37),
 range(37, 48),
 range(48, 51),
 range(51, 76),
 range(76, 79),
 range(79, 82),
 range(82, 87),
 range(87, 96),
 range(96, 104)]
def _multi_multi_log_loss(predicted,
                          actual,
                          class_column_indices=BOX_PLOTS_COLUMN_INDICES,
                          eps=1e-15):
    """ Multi class version of Logarithmic Loss metric as implemented on
    DrivenData.org
    """
    class_scores = np.ones(len(class_column_indices), dtype=np.float64)
    
    # calculate log loss for each set of columns that belong to a class:
    for k, this_class_indices in enumerate(class_column_indices):
        # get just the columns for this class
        preds_k = predicted[:, this_class_indices].astype(np.float64)
        
        # normalize so probabilities sum to one (unless sum is zero, then we clip)
        preds_k /= np.clip(preds_k.sum(axis=1).reshape(-1, 1), eps, np.inf)

        actual_k = actual[:, this_class_indices]

        # shrink predictions so
        y_hats = np.clip(preds_k, eps, 1 - eps)
        sum_logs = np.sum(actual_k * np.log(y_hats))
        class_scores[k] = (-1.0 / actual.shape[0]) * sum_logs
        
    return np.average(class_scores)
def score_submission(pred_path="./", holdout_path='LabelData.csv'):
    """ Score a predictions CSV against the holdout labels CSV.

        The holdout labels are read from `holdout_path`, cast column by
        column to categoricals and one-hot encoded. The predictions read
        from `pred_path` must have exactly the same columns and index
        (checked with assertions). Returns the multi-class log loss.
    """
    # this happens on the backend to get the score
    raw_holdout = pd.read_csv(holdout_path, index_col=0)
    categorical_holdout = raw_holdout.apply(lambda col: col.astype('category'), axis=0)
    holdout_labels = pd.get_dummies(categorical_holdout)

    preds = pd.read_csv(pred_path, index_col=0)

    # make sure that format is correct
    same_columns = (preds.columns == holdout_labels.columns).all()
    assert same_columns
    same_index = (preds.index == holdout_labels.index).all()
    assert same_index

    return _multi_multi_log_loss(preds.values, holdout_labels.values)

# Predict class probabilities for the holdout set with the fitted pipeline.
act_holdout = pd.read_csv("act_HoldoutData.csv", index_col=0)
predictions = pl.predict_proba(act_holdout[NON_LABELS])
# don't need to fillna's since we applied SimpleImputer in the pipeline.

# Label the probability columns with the one-hot label names and write them
# out in the format score_submission reads back.
prediction_df = pd.DataFrame(columns=pd.get_dummies(df[LABELS]).columns, index=act_holdout.index, 
                             data = predictions)
prediction_df.to_csv("predictions2.csv")
score = score_submission(pred_path ="predictions2.csv")
# NOTE(review): the message says "numeric data only", but `pl` also has a
# text_features branch - the wording looks stale.
print('Your model, trained with numeric data only, yields logloss score: {}'.format(score))
# Score of the fitted pipeline on the held-out test split.
accuracy = pl.score(X_test, y_test)
print("\nAccuracy on budget dataset: ", accuracy)

Your model, trained with numeric data only, yields logloss score: 1.3604676692550002

Accuracy on budget dataset:  0.22435897435897437


## Learning from the expert: a stats trick

Because CountVectorizer produces a sparse matrix and PolynomialFeatures does not support sparse matrices, SparseInteractions is used instead. Interaction terms describe when tokens appear together.

$$ \beta_1 x_1 + \beta_2 x_2 + \beta_3 (x_1 \times x_2) $$


### Implement interaction modeling in scikit-learn

In [6]:
from itertools import combinations

import numpy as np
from scipy import sparse
from sklearn.base import BaseEstimator, TransformerMixin


class SparseInteractions(BaseEstimator, TransformerMixin):
    """Append products of column combinations to a sparse feature matrix.

    For every combination of 2 up to `degree` distinct columns, the
    element-wise product of those columns is added as a new column after
    the original ones. Names for all output columns are available from
    `get_feature_names()` once `transform` has run.
    """

    def __init__(self, degree=2, feature_name_separator="_"):
        # degree: largest number of columns multiplied together
        # feature_name_separator: joins source column names into the name
        # of an interaction column
        self.degree = degree
        self.feature_name_separator = feature_name_separator

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns self."""
        return self

    def transform(self, X):
        """Return `X` with the interaction columns stacked to its right."""
        # Column names must be read BEFORE converting to CSC: the sparse
        # matrix has no `columns` attribute, so checking afterwards would
        # always fall through to the positional names.
        if hasattr(X, "columns"):
            col_names = np.array([str(c) for c in X.columns])
        else:
            col_names = None

        if not sparse.isspmatrix_csc(X):
            X = sparse.csc_matrix(X)

        if col_names is None:
            col_names = np.array([str(i) for i in range(X.shape[1])])
        self.orig_col_names = col_names

        return self._create_sparse_interactions(X)

    def get_feature_names(self):
        """Return the output column names recorded by the last transform."""
        return self.feature_names

    def _create_sparse_interactions(self, X):
        """Build the interaction columns and record their names."""
        out_mat = []
        self.feature_names = self.orig_col_names.tolist()

        for sub_degree in range(2, self.degree + 1):
            for col_ixs in combinations(range(X.shape[1]), sub_degree):
                # add name for new column
                name = self.feature_name_separator.join(self.orig_col_names[list(col_ixs)])
                self.feature_names.append(name)

                # element-wise product of the selected columns
                out = X[:, col_ixs[0]]
                for j in col_ixs[1:]:
                    out = out.multiply(X[:, j])

                out_mat.append(out)

        # original columns first, interaction columns after
        return sparse.hstack([X] + out_mat)


In [7]:
# Same pipeline as before, with an interaction step inserted between the
# feature union and the scaler.
pl = Pipeline([
        ('union', FeatureUnion(
            transformer_list = [
                ('numeric_features', Pipeline([
                    ('selector', get_numeric_data),
                    ('imputer', SimpleImputer())
                ])),
                ('text_features', Pipeline([
                    ('selector', get_text_data),
                    ('vectorizer', CountVectorizer(token_pattern=TOKENS_ALPHANUMERIC,
                                                   ngram_range=(1, 2))),
                    # `k` must be passed by keyword: recent scikit-learn
                    # releases reject it as a positional argument.
                    ('dim_red', SelectKBest(chi2, k=chi_k))
                ]))
             ]
        )),
        ("int", SparseInteractions(degree=2)),
        ('scale', MaxAbsScaler()),
        ('clf', OneVsRestClassifier(LogisticRegression(max_iter=1000)))
    ])
pl.fit(X_train, y_train)


# Predict class probabilities for the holdout set with the fitted pipeline.
act_holdout = pd.read_csv("act_HoldoutData.csv", index_col=0)
predictions = pl.predict_proba(act_holdout[NON_LABELS])
# don't need to fillna's since we applied SimpleImputer in the pipeline.


# Label the probability columns with the one-hot label names and write them
# out in the format score_submission reads back.
prediction_df = pd.DataFrame(columns=pd.get_dummies(df[LABELS]).columns, index=act_holdout.index, 
                             data = predictions)
prediction_df.to_csv("predictions3.csv")
score = score_submission(pred_path ="predictions3.csv")
# NOTE(review): the message says "numeric data only", but `pl` also has a
# text_features branch - the wording looks stale.
print('Your model, trained with numeric data only, yields logloss score: {}'.format(score))
# Score of the fitted pipeline on the held-out test split.
accuracy = pl.score(X_test, y_test)
print("\nAccuracy on budget dataset: ", accuracy)



Your model, trained with numeric data only, yields logloss score: 1.3193666283220828

Accuracy on budget dataset:  0.3525641025641026


# To clear up the confusion

The outputs of the example below can help in understanding what both CountVectorizer and SparseInteractions (or PolynomialFeatures) do.

In [8]:

# Three tiny documents to make the transformations easy to inspect.
d = {"c1":["This is a sentence is", "This is another sentence", "third document is here"]}
df1 = pd.DataFrame(d)
print(df1)
# Uni- and bi-gram counts with the default token pattern.
ct = CountVectorizer(ngram_range=(1, 2))
# `a` is first the fitted vectorizer ...
a = ct.fit(df1["c1"])
print(a.vocabulary_)
print(a.get_feature_names())
names = a.get_feature_names()
# ... and is then rebound to the document-term count matrix.
a = ct.transform(df1["c1"])
print(a.shape)
print(a.toarray())

# Same counts as a labelled frame.
df2 = pd.DataFrame(a.toarray(), columns=names)
print(df2)

# Add pairwise interaction columns to the count matrix.
sp = SparseInteractions(degree=2)
values = sp.fit_transform(a).toarray()
# fit() itself records no names; this returns the names stored by the
# fit_transform call on the line above.
names = sp.fit(a).get_feature_names()
df3 = pd.DataFrame(values, columns=names)
print(df3)

                         c1
0     This is a sentence is
1  This is another sentence
2    third document is here
{'this': 13, 'is': 5, 'sentence': 9, 'this is': 14, 'is sentence': 8, 'sentence is': 10, 'another': 0, 'is another': 6, 'another sentence': 1, 'third': 11, 'document': 2, 'here': 4, 'third document': 12, 'document is': 3, 'is here': 7}
['another', 'another sentence', 'document', 'document is', 'here', 'is', 'is another', 'is here', 'is sentence', 'sentence', 'sentence is', 'third', 'third document', 'this', 'this is']
(3, 15)
[[0 0 0 0 0 2 0 0 1 1 1 0 0 1 1]
 [1 1 0 0 0 1 1 0 0 1 0 0 0 1 1]
 [0 0 1 1 1 1 0 1 0 0 0 1 1 0 0]]
   another  another sentence  document  document is  here  is  is another  \
0        0                 0         0            0     0   2           0   
1        1                 1         0            0     0   1           1   
2        0                 0         1            1     1   1           0   

   is here  is sentence  sentence  sentence is  t

## Learning from the expert: the winning model

A balance is needed between adding new features and the computational cost of the additional columns. Adding 3-grams and 4-grams enormously increases the size of the array, and as the array grows, more computational power is needed to fit the model. The hashing trick is a way of limiting the size of the matrix without losing too much model accuracy.

A hash function takes an input and outputs a hash value, and the range of possible outputs can be limited. As a result, some columns will have multiple tokens that map to them, which acts as dimensionality reduction.

Instead of using the CountVectorizer that creates the bag of words representation, HashingVectorizer can be used.

### Implementing the hashing trick in scikit-learn

In [13]:
from sklearn.feature_extraction.text import HashingVectorizer

# One combined text string per training row.
text_data = combine_text_columns(X_train)
# A token is a run of alphanumerics that is followed by whitespace.
TOKENS_ALPHANUMERIC = '[A-Za-z0-9]+(?=\\s+)'
hashing_vec = HashingVectorizer(token_pattern=TOKENS_ALPHANUMERIC)
hashed_text = hashing_vec.fit_transform(text_data)
# `.data` is the flat array of values stored in the sparse result; show the
# first few of them.
hashed_df = pd.DataFrame(hashed_text.data)
hashed_df.head()

Unnamed: 0,0
0,-0.160128
1,0.160128
2,-0.480384
3,-0.320256
4,0.160128


### Build the winning model

In [16]:
# Same pipeline as the interaction model, with HashingVectorizer in place
# of CountVectorizer for the text branch.
pl = Pipeline([
        ('union', FeatureUnion(
            transformer_list = [
                ('numeric_features', Pipeline([
                    ('selector', get_numeric_data),
                    ('imputer', SimpleImputer())
                ])),
                ('text_features', Pipeline([
                    ('selector', get_text_data),
                    ("vectorizer", HashingVectorizer(token_pattern=TOKENS_ALPHANUMERIC,
                                                     alternate_sign=False, norm=None, binary=False,
                                                     ngram_range=(1, 2))),
                    # `k` must be passed by keyword: recent scikit-learn
                    # releases reject it as a positional argument.
                    ('dim_red', SelectKBest(chi2, k=chi_k))
                ]))
             ]
        )),
        ('int', SparseInteractions(degree=2)),
        ('scale', MaxAbsScaler()),
        ('clf', OneVsRestClassifier(LogisticRegression()))
    ])
pl.fit(X_train, y_train)
# Predict class probabilities for the holdout set; `act_holdout` is the
# frame loaded in an earlier cell.
predictions = pl.predict_proba(act_holdout[NON_LABELS])
# don't need to fillna's since we applied SimpleImputer in the pipeline.


# Label the probability columns with the one-hot label names and write them
# out in the format score_submission reads back.
prediction_df = pd.DataFrame(columns=pd.get_dummies(df[LABELS]).columns, index=act_holdout.index, 
                             data = predictions)
prediction_df.to_csv("predictions4.csv")
score = score_submission(pred_path ="predictions4.csv")
# NOTE(review): the message says "numeric data only", but `pl` also has a
# text_features branch - the wording looks stale.
print('Your model, trained with numeric data only, yields logloss score: {}'.format(score))
# computing accuracy doesn't make sense



Your model, trained with numeric data only, yields logloss score: 1.3179052059738843


Looks like the performance is about the same, but this is expected since the HashingVectorizer should work the same as the CountVectorizer.

In [18]:
# Bare expression: in a notebook this displays the prediction frame.
prediction_df

Unnamed: 0,Function_Aides Compensation,Function_Career & Academic Counseling,Function_Communications,Function_Curriculum Development,Function_Data Processing & Information Services,Function_Development & Fundraising,Function_Enrichment,Function_Extended Time & Tutoring,Function_Facilities & Maintenance,Function_Facilities Planning,...,Object_Type_Rent/Utilities,Object_Type_Substitute Compensation,Object_Type_Supplies/Materials,Object_Type_Travel & Conferences,Pre_K_NO_LABEL,Pre_K_Non PreK,Pre_K_PreK,Operating_Status_Non-Operating,"Operating_Status_Operating, Not PreK-12",Operating_Status_PreK-12 Operating
237,0.010336,0.001350,0.000950,0.002223,0.001012,0.001552,0.036873,0.002458,0.002052,0.000632,...,0.002558,0.011131,0.003388,0.003058,0.008551,0.991075,0.002052,0.002629,0.002805,0.996208
466,0.008051,0.007658,0.003251,0.024367,0.005281,0.011267,0.322427,0.012399,0.010652,0.001297,...,0.019754,0.004455,0.070853,0.022528,0.855529,0.153876,0.005274,0.049759,0.022046,0.900228
784,0.002439,0.004282,0.002263,0.010340,0.006531,0.005756,0.012235,0.007307,0.014390,0.001135,...,0.012697,0.894072,0.011019,0.009794,0.986357,0.011677,0.005030,0.009602,0.004888,0.989973
1786,0.004543,0.007863,0.003393,0.035710,0.005777,0.012877,0.099831,0.016628,0.015982,0.001394,...,0.040161,0.006948,0.066109,0.035574,0.891680,0.090272,0.007728,0.040823,0.011226,0.944377
2643,0.006274,0.005827,0.004016,0.070786,0.006112,0.010107,0.170501,0.007108,0.016798,0.001416,...,0.020751,0.004496,0.051330,0.024462,0.675536,0.317759,0.006514,0.018316,0.009561,0.975119
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
272864,0.010935,0.011039,0.002428,0.060812,0.003559,0.007837,0.076758,0.014596,0.016202,0.001319,...,0.022875,0.005079,0.040123,0.019194,0.951532,0.056505,0.004628,0.037189,0.023021,0.916348
448362,0.001055,0.000879,0.000367,0.000735,0.000593,0.004705,0.000884,0.001197,0.000550,0.000419,...,0.000591,0.000723,0.000280,0.000788,0.997210,0.002516,0.001005,0.997258,0.000630,0.003557
181209,0.052098,0.005101,0.002655,0.029381,0.008218,0.004262,0.006723,0.004204,0.006792,0.001427,...,0.006913,0.002779,0.016820,0.006454,0.902857,0.061477,0.008899,0.026660,0.036588,0.897919
364039,0.006452,0.012982,0.002915,0.021123,0.004712,0.009823,0.145637,0.017952,0.014522,0.001404,...,0.021685,0.005034,0.079683,0.029895,0.878265,0.113958,0.005429,0.031535,0.017366,0.943719
