# Multi-column text and numerical data pipeline

In [1]:
# silence all warnings (this hides every category, not only sklearn deprecations)
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

# import our custom train_test split function
from multilabel import multilabel_train_test_split

# Seed NumPy's global random number generator so a full re-run is repeatable
np.random.seed(0)

# Load the training data; the first CSV column becomes the row index
df = pd.read_csv('../data/TrainingData.csv', index_col=0)

# Columns holding numeric values
NUMERIC_COLUMNS = ['FTE', 'Total']

# Columns used as labels (they are one-hot encoded further down)
LABELS = [
    'Function', 'Use', 'Sharing', 'Reporting', 'Student_Type',
    'Position_Type', 'Object_Type', 'Pre_K', 'Operating_Status',
]

# Every column of df that is not a label, in the frame's own column order
NON_LABELS = [name for name in df.columns if name not in LABELS]

Useful preprocessing functions

In [2]:
def combine_text_columns(data_frame, to_drop=None):
    """Join the text columns of each row into a single space-separated string.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Input rows. The frame is not modified.
    to_drop : iterable of column names, optional
        Columns to exclude before joining. Names that are not present in
        ``data_frame`` are ignored. When omitted, ``NUMERIC_COLUMNS + LABELS``
        is used, looked up at call time so that re-running the constants cell
        is picked up without redefining this function.

    Returns
    -------
    pandas.Series
        One string per row, indexed like ``data_frame``. Missing values
        contribute an empty string; non-string values are converted with
        ``str`` so that the join cannot fail on mixed dtypes.
    """
    if to_drop is None:
        to_drop = NUMERIC_COLUMNS + LABELS

    # Only drop the columns that actually exist in this frame
    unwanted = set(to_drop)
    present = [column for column in data_frame.columns if column in unwanted]

    # drop() and fillna() both return new frames, so the caller's data is left
    # untouched; astype(str) guards " ".join against non-string cells.
    text_data = data_frame.drop(columns=present).fillna('').astype(str)

    # Join all items in a row with a single space in between
    return text_data.apply(" ".join, axis=1)

def _select_numeric_columns(data_frame):
    """Return only the NUMERIC_COLUMNS of data_frame."""
    return data_frame[NUMERIC_COLUMNS]


# Wrap the row-wise text combiner so it can be used as a pipeline step
get_text_data = FunctionTransformer(combine_text_columns, validate=False)

# A named function is used instead of a lambda: lambdas cannot be pickled,
# which would prevent saving any pipeline that contains this transformer.
get_numeric_data = FunctionTransformer(_select_numeric_columns, validate=False)

In [4]:
# One-hot encode the label columns
labels = pd.get_dummies(df[LABELS])

# Split features and encoded labels into train and test parts
# (0.2 is presumably the test-set share -- confirm against multilabel_train_test_split)
X_train, X_test, y_train, y_test = multilabel_train_test_split(
    df[NON_LABELS], labels, 0.2, seed=123
)

# Two branches feed the classifier: imputed numeric columns and
# bag-of-words counts of the combined text columns.
pipepline = Pipeline(steps=[
    (
        'union',
        FeatureUnion(transformer_list=[
            (
                'numeric_features',
                Pipeline(steps=[
                    ('selector', get_numeric_data),
                    ('imputer', SimpleImputer()),
                ]),
            ),
            (
                'text_features',
                Pipeline(steps=[
                    ('selector', get_text_data),
                    ('vectorizer', CountVectorizer()),
                ]),
            ),
        ]),
    ),
    ('clf', OneVsRestClassifier(LogisticRegression())),
])

# Train on the training split
pipepline.fit(X_train, y_train)

# Score on the held-out split and report it
accuracy = pipepline.score(X_test, y_test)
print("\nAccuracy on budget dataset: ", accuracy)




Accuracy on budget dataset:  0.35722940478421084


**Actual Result**:

Accuracy on budget dataset:  0.203846153846

Using a pipeline provides flexibility: we can quickly try different models (e.g. k-NN, Naive Bayes, Random Forest) by simply changing the `clf` step — the estimator.

Using the same pipeline with a Random Forest classifier

In [6]:
from sklearn.ensemble import RandomForestClassifier

# Same feature branches as before; only the final estimator is swapped
# for a random forest with its default settings.
pl = Pipeline(steps=[
    (
        'union',
        FeatureUnion(transformer_list=[
            (
                'numeric_features',
                Pipeline(steps=[
                    ('selector', get_numeric_data),
                    ('imputer', SimpleImputer()),
                ]),
            ),
            (
                'text_features',
                Pipeline(steps=[
                    ('selector', get_text_data),
                    ('vectorizer', CountVectorizer()),
                ]),
            ),
        ]),
    ),
    ('clf', RandomForestClassifier()),
])

# Train on the training split
pl.fit(X_train, y_train)

# Score on the held-out split and report it
accuracy = pl.score(X_test, y_test)
print("\nAccuracy on budget dataset: ", accuracy)


Accuracy on budget dataset:  0.9040409718318656


**Actual result**:

Accuracy on budget dataset:  0.282692307692

Using a `RandomForestClassifier` improved the accuracy of the model. We'll try to improve that figure by adjusting the classifier's `n_estimators` value to `15` (the default was `10` in scikit-learn releases before 0.22 and is `100` from 0.22 onward).

In [7]:
# Same feature branches again; the random forest now uses 15 trees.
pl = Pipeline(steps=[
    (
        'union',
        FeatureUnion(transformer_list=[
            (
                'numeric_features',
                Pipeline(steps=[
                    ('selector', get_numeric_data),
                    ('imputer', SimpleImputer()),
                ]),
            ),
            (
                'text_features',
                Pipeline(steps=[
                    ('selector', get_text_data),
                    ('vectorizer', CountVectorizer()),
                ]),
            ),
        ]),
    ),
    ('clf', RandomForestClassifier(n_estimators=15)),
])

# Train on the training split
pl.fit(X_train, y_train)

# Score on the held-out split and report it
accuracy = pl.score(X_test, y_test)
print("\nAccuracy on budget dataset: ", accuracy)


Accuracy on budget dataset:  0.9123352695022172


**Actual results**:

Accuracy on budget dataset:  0.325