# Creating a simple first model

## It's time to build a model


Starting with a simple model is always a good approach.

Multi-class logistic regression: treats each label column as independent. OneVsRestClassifier does that. 

### Setting up a train-test split in scikit-learn

In [17]:
import pandas as pd

df = pd.read_csv("TrainingData.csv", index_col=0)
NUMERIC_COLUMNS = ['FTE', 'Total']

LABELS = ['Function', 'Use', 'Sharing', 'Reporting', 'Student_Type', 'Position_Type', 'Object_Type', 'Pre_K', 'Operating_Status']

from warnings import warn

import numpy as np
import pandas as pd

def multilabel_sample(y, size=1000, min_count=5, seed=None):
    """ Takes a matrix of binary labels `y` and returns
        the indices for a sample of size `size` if
        `size` > 1 or `size` * len(y) if size =< 1.
        The sample is guaranteed to have > `min_count` of
        each label.
    """
    try:
        if (np.unique(y).astype(int) != np.array([0, 1])).any():
            raise ValueError()
    except (TypeError, ValueError):
        raise ValueError('multilabel_sample only works with binary indicator matrices')

    if (y.sum(axis=0) < min_count).any():
        raise ValueError('Some classes do not have enough examples. Change min_count if necessary.')

    if size <= 1:
        size = np.floor(y.shape[0] * size)

    if y.shape[1] * min_count > size:
        msg = "Size less than number of columns * min_count, returning {} items instead of {}."
        warn(msg.format(y.shape[1] * min_count, size))
        size = y.shape[1] * min_count

    rng = np.random.RandomState(seed if seed is not None else np.random.randint(1))

    if isinstance(y, pd.DataFrame):
        choices = y.index
        y = y.values
    else:
        choices = np.arange(y.shape[0])

    sample_idxs = np.array([], dtype=choices.dtype)

    # first, guarantee > min_count of each label
    for j in range(y.shape[1]):
        label_choices = choices[y[:, j] == 1]
        label_idxs_sampled = rng.choice(label_choices, size=min_count, replace=False)
        sample_idxs = np.concatenate([label_idxs_sampled, sample_idxs])

    sample_idxs = np.unique(sample_idxs)

    # now that we have at least min_count of each, we can just random sample
    sample_count = int(size - sample_idxs.shape[0])

    # get sample_count indices from remaining choices
    remaining_choices = np.setdiff1d(choices, sample_idxs)
    remaining_sampled = rng.choice(remaining_choices,
                                   size=sample_count,
                                   replace=False)

    return np.concatenate([sample_idxs, remaining_sampled])


def multilabel_sample_dataframe(df, labels, size, min_count=5, seed=None):
    """ Takes a dataframe `df` and returns a sample of size `size` where all
        classes in the binary matrix `labels` are represented at
        least `min_count` times.
    """
    idxs = multilabel_sample(labels, size=size, min_count=min_count, seed=seed)
    return df.loc[idxs]


def multilabel_train_test_split(X, Y, size, min_count=5, seed=None):
    """ Takes a features matrix `X` and a label matrix `Y` and
        returns (X_train, X_test, Y_train, Y_test) where all
        classes in Y are represented at least `min_count` times.
    """
    index = Y.index if isinstance(Y, pd.DataFrame) else np.arange(Y.shape[0])

    test_set_idxs = multilabel_sample(Y, size=size, min_count=min_count, seed=seed)
    train_set_idxs = np.setdiff1d(index, test_set_idxs)

    test_set_mask = index.isin(test_set_idxs)
    train_set_mask = ~test_set_mask

    return (X[train_set_mask], X[test_set_mask], Y[train_set_mask], Y[test_set_mask])

numeric_data_only = df[NUMERIC_COLUMNS].fillna(-1000)
label_dummies = pd.get_dummies(df[LABELS])
X_train, X_test, y_train, y_test = multilabel_train_test_split(numeric_data_only, label_dummies, size=0.2, seed=123)

print("X_train info:")
print(X_train.info())
print("\nX_test info:")  
print(X_test.info())
print("\ny_train info:")  
print(y_train.info())
print("\ny_test info:")  
print(y_test.info()) 

X_train info:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 320222 entries, 134338 to 415831
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   FTE     320222 non-null  float64
 1   Total   320222 non-null  float64
dtypes: float64(2)
memory usage: 7.3 MB
None

X_test info:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 80055 entries, 206341 to 72072
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   FTE     80055 non-null  float64
 1   Total   80055 non-null  float64
dtypes: float64(2)
memory usage: 1.8 MB
None

y_train info:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 320222 entries, 134338 to 415831
Columns: 104 entries, Function_Aides Compensation to Operating_Status_PreK-12 Operating
dtypes: uint8(104)
memory usage: 34.2 MB
None

y_test info:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 80055 entries, 206341 to 72072
Columns: 104 entries, Funct

### Training a model

In [21]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

clf = OneVsRestClassifier(LogisticRegression())
clf.fit(X_train, y_train)
print("Accuracy: {}".format(clf.score(X_test, y_test)))

Accuracy: 0.0


Before adding the text data, how the model does when scored by log loss?

## Making predictions

### Use your model to predict values on holdout data

In [208]:

act_holdout = pd.read_csv("act_HoldoutData.csv", index_col=0)
predictions = clf.predict_proba(act_holdout[NUMERIC_COLUMNS].fillna(-1000))

### Writing out your results to a csv for submission

In [210]:

BOX_PLOTS_COLUMN_INDICES = [range(0, 37),
 range(37, 48),
 range(48, 51),
 range(51, 76),
 range(76, 79),
 range(79, 82),
 range(82, 87),
 range(87, 96),
 range(96, 104)]
def _multi_multi_log_loss(predicted,
                          actual,
                          class_column_indices=BOX_PLOTS_COLUMN_INDICES,
                          eps=1e-15):
    """ Multi class version of Logarithmic Loss metric as implemented on
    DrivenData.org
    """
    class_scores = np.ones(len(class_column_indices), dtype=np.float64)
    
    # calculate log loss for each set of columns that belong to a class:
    for k, this_class_indices in enumerate(class_column_indices):
        # get just the columns for this class
        preds_k = predicted[:, this_class_indices].astype(np.float64)
        
        # normalize so probabilities sum to one (unless sum is zero, then we clip)
        preds_k /= np.clip(preds_k.sum(axis=1).reshape(-1, 1), eps, np.inf)

        actual_k = actual[:, this_class_indices]

        # shrink predictions so
        y_hats = np.clip(preds_k, eps, 1 - eps)
        sum_logs = np.sum(actual_k * np.log(y_hats))
        class_scores[k] = (-1.0 / actual.shape[0]) * sum_logs
        
    return np.average(class_scores)
def score_submission(pred_path="./", holdout_path='LabelData.csv'):
    # this happens on the backend to get the score
    holdout_labels = pd.get_dummies(pd.read_csv(holdout_path, index_col=0).apply(lambda x: x.astype('category'), axis=0))

    preds = pd.read_csv(pred_path, index_col=0)
    
    # make sure that format is correct
    assert (preds.columns == holdout_labels.columns).all()
    assert (preds.index == holdout_labels.index).all()

    return _multi_multi_log_loss(preds.values, holdout_labels.values)

prediction_df = pd.DataFrame(columns=pd.get_dummies(df[LABELS]).columns, index=act_holdout.index, 
                             data = predictions)
prediction_df.to_csv("predictions.csv")
score = score_submission(pred_path ="predictions.csv")
print('Your model, trained with numeric data only, yields logloss score: {}'.format(score))

Your model, trained with numeric data only, yields logloss score: 1.958799216922374


Even though your basic model scored 0.0 accuracy, it nevertheless performs better than the benchmark score of 2.0455.

# A very brief introduction to NLP

# Representing text numerically

   **Extrapolation from a confusion about how the CountVectorizer works...**

When we have columns of words that are significant for each data point and since we can not use them because of sklearn don't accept word we need to make them meaningful  mathematically while preserving their entity of words (sometimes their rows and grammar can be important and worth considering). By tokenizing them we can do that though while tokenizing the rows of the columns we need to combine them (the only way Countvectorizer works). After fitting the data Countvectorizer creates bags of words (the whole encountered tokens) and data becomes a sparse matrix...

Bag of words is one of the simplest ways to represent text in a machine learning algorithm. It discards information about grammar and word order, just assuming that the number of times a word accurs is enough information. -> CountVectorizer. 

In [242]:
from sklearn.feature_extraction.text import CountVectorizer
TOKENS_BASIC = "\\S+(?=\\s+)"
df.Program_Description.fillna("", inplace=True)
vec_basic = CountVectorizer(token_pattern=TOKENS_BASIC)
vec_basic.fit(df.Program_Description)
len(vec_basic.get_feature_names())

434

### Creating a bag-of-words in scikit-learn

In [273]:
TOKENS_ALPHANUMERIC = '[A-Za-z0-9]+(?=\\s+)'
df.fillna("Position_Extra", inplace=True)
vec_alphanumeric = CountVectorizer(token_pattern=TOKENS_ALPHANUMERIC)
vec_alphanumeric.fit(df["Position_Extra"])
msg = "There are {} tokens in Position_Extra if we split on non-alpha numeric"
print(msg.format(len(vec_alphanumeric.get_feature_names())))
print(vec_alphanumeric.get_feature_names()[:15])
print("\n")
TOKENS_BASIC = "\\S+(?=\\s+)"
df.fillna("Position_Extra", inplace=True)
vec_basic = CountVectorizer(token_pattern=TOKENS_BASIC)
vec_basic.fit(df["Position_Extra"])
msg = "There are {} tokens in Position_Extra if we split on basic"
print(msg.format(len(vec_basic.get_feature_names())))
print(vec_basic.get_feature_names()[:15])

There are 385 tokens in Position_Extra if we split on non-alpha numeric
['1st', '2nd', '3rd', '4th', '56', '5th', '9th', 'a', 'ab', 'accountability', 'adaptive', 'addit', 'additional', 'adm', 'admin']


There are 415 tokens in Position_Extra if we split on non-alpha numeric
['&', '(no', '(slp)', '-', '-2nd', '1st', '2nd', '3rd', '4th', '56', '5th', '9th', 'a', 'ab', 'accountability']


### Combining text columns for tokenization

In order to get a bag-of-words representation for all of the text data in our DataFrame, you must first convert the text data in each row of the DataFrame into a single string.

In [244]:
def combine_text_columns(data_frame, to_drop=NUMERIC_COLUMNS + LABELS):
    to_drop = set(to_drop) & set(data_frame.columns.tolist())
    text_data = data_frame.drop(to_drop, axis=1)
    text_data.fillna("",inplace=True)
    return text_data.apply(lambda x: " ".join(x), axis=1)


### What's in a token?


In [267]:
vec_basic = CountVectorizer(token_pattern=TOKENS_BASIC)
vec_alphanumeric = CountVectorizer(token_pattern=TOKENS_ALPHANUMERIC)

text_vector = combine_text_columns(df)

vec_basic.fit_transform(text_vector)
print("There are {} tokens in the dataset".format(len(vec_basic.get_feature_names())))

vec_alphanumeric.fit_transform(text_vector)
print("There are {} alpha-numeric tokens in the dataset".format(len(vec_alphanumeric.get_feature_names())))


There are 4757 tokens in the dataset
There are 3284 alpha-numeric tokens in the dataset
