# Bringing It All Together

References:

[Github Repo](https://github.com/datacamp/course-resources-ml-with-experts-budgets)  
[DrivenData Competition Page](https://www.drivendata.org/competitions/46/box-plots-for-education-reboot)  


In [1]:
# ignore deprecation warnings in sklearn
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# import our custom train_test split function
from multilabel import multilabel_train_test_split

# Fix numpy's global RNG so repeated runs give the same results.
np.random.seed(0)

# Column groups: numeric features and the target label columns.
NUMERIC_COLUMNS = ['FTE', 'Total']
LABELS = [
    'Function',
    'Use',
    'Sharing',
    'Reporting',
    'Student_Type',
    'Position_Type',
    'Object_Type',
    'Pre_K',
    'Operating_Status',
]

# Load the training data, using the first CSV column as the index.
df = pd.read_csv('../data/TrainingData.csv', index_col=0)

# Every column that is not a label is treated as a feature.
NON_LABELS = [name for name in df.columns if name not in LABELS]

# One indicator column per (label, value) pair.
labels = pd.get_dummies(df[LABELS])

# Split features and indicator targets with the project's multilabel helper.
# NOTE(review): 0.2 is presumably the test-set fraction -- confirm against
# the `multilabel` module.
X_train, X_test, y_train, y_test = multilabel_train_test_split(
    df[NON_LABELS], labels, 0.2, seed=123
)

In [2]:
from sklearn.preprocessing import FunctionTransformer

# useful text preprocessing functions
def combine_text_columns(data_frame, to_drop=None):
    """Combine the text columns of each row into a single string.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Frame holding the raw features. It is not modified.
    to_drop : iterable of column labels, optional
        Columns to leave out of the combined text. Labels that are not
        present in ``data_frame`` are ignored. When omitted, defaults to
        ``NUMERIC_COLUMNS + LABELS`` (looked up at call time).

    Returns
    -------
    pandas.Series
        The result of a row-wise ``apply``: for each row, the remaining
        values joined with single spaces, with missing values contributing
        an empty string.
    """
    if to_drop is None:
        to_drop = NUMERIC_COLUMNS + LABELS

    # Only drop columns that actually exist, so the same function works on
    # frames with and without the label columns.
    requested = set(to_drop)
    present = [column for column in data_frame.columns if column in requested]
    text_data = data_frame.drop(columns=present)

    # Missing values become empty strings; everything else is cast to str so
    # that a stray non-string cell cannot make the join raise TypeError.
    text_data = text_data.fillna('').astype(str)

    # Join all items in a row with a space in between.
    return text_data.apply(" ".join, axis=1)

# Transformer that collapses a frame's text columns into one string per row.
get_text_data = FunctionTransformer(func=combine_text_columns, validate=False)

# Transformer that keeps only the numeric columns of a frame.
get_numeric_data = FunctionTransformer(
    func=lambda frame: frame[NUMERIC_COLUMNS],
    validate=False,
)

### Build, fit and score the pipeline

In [4]:
from sklearn.pipeline import FeatureUnion
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import PolynomialFeatures

# Token pattern: runs of alphanumeric characters that are followed by whitespace.
TOKENS_ALPHANUMERIC = '[A-Za-z0-9]+(?=\\s+)'

# The parameters alternate_sign=False, norm=None, and binary=False make the
# HashingVectorizer perform similarly to the default settings on the
# CountVectorizer so you can just replace one with the other.
# (`non_negative=True` was removed from HashingVectorizer in scikit-learn
# 0.21; `alternate_sign=False` is its documented replacement.)
#
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt.
# Building pairwise interactions over the full hashed text feature space is
# presumably very expensive -- consider adding a feature-selection step to
# the text branch before 'int'.
pl = Pipeline([
    # Build numeric and text features side by side and concatenate them.
    ('union', FeatureUnion([
        ('numeric_features', Pipeline([
            ('selector', get_numeric_data),
            ('imputer', SimpleImputer())
        ])),
        ('text_features', Pipeline([
            ('selector', get_text_data),
            ('vectorizer', HashingVectorizer(
                norm=None,
                binary=False,
                alternate_sign=False,
                token_pattern=TOKENS_ALPHANUMERIC,
                ngram_range=(1,2)
            ))
        ]))
    ])),
    # Add pairwise interaction terms between the combined features.
    ('int', PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)),
    # One logistic-regression classifier per indicator column.
    ('clf',  OneVsRestClassifier(LogisticRegression()))
])

pl.fit(X_train, y_train)

# Compute and print accuracy
accuracy = pl.score(X_test, y_test)
print("\nAccuracy on budget dataset: ", accuracy)

KeyboardInterrupt: 

**Actual result**:

Log loss: 1.2258. Performance is about the same as using a `CountVectorizer`, but this is expected since the `HashingVectorizer` should work the same.

### Submit predictions

Export predictions to csv in the appropriate format for uploading to the DrivenData competition web site.

Because all the preprocessing has been built into the pipeline, we don't have to do any preprocessing to the holdout data, `TestData.csv`. 

In [None]:
# Load the holdout set, using the first CSV column as the index.
holdout = pd.read_csv('../data/TestData.csv', index_col=0)

# Output column names are built from the training labels as `<label>__<value>`.
columns = pd.get_dummies(df[LABELS], prefix_sep='__').columns

# Class probabilities from the fitted pipeline.
predictions = pl.predict_proba(holdout)

# Wrap the probabilities in a frame aligned to the holdout index and save it.
df_predictions = pd.DataFrame(data=predictions, index=holdout.index, columns=columns)
df_predictions.to_csv('../predictions/predictions.csv')

# Preview the first rows.
df_predictions.head()

In [None]:
# Display the (rows, columns) shape of the predictions frame.
df_predictions.shape