In [7]:
import sys
import pickle
import pandas as pd
from sqlalchemy import create_engine

from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import BaseEstimator, TransformerMixin

In [4]:
def load_data(database_filepath):
    """Read the emissions table from a SQLite file and split features from target.

    Args:
        database_filepath: Path to the SQLite database file that contains an
            ``emissions`` table.

    Returns:
        A tuple ``(X, y)``: ``X`` is the table without the ``tax_band``
        column and ``y`` is the ``tax_band`` column.
    """

    engine = create_engine(f'sqlite:///{database_filepath}')
    try:
        df = pd.read_sql_table('emissions', engine)
    finally:
        # The table is fully loaded into memory at this point, so close the
        # engine's pooled connections instead of leaving them to the garbage
        # collector.
        engine.dispose()

    X = df.drop(columns='tax_band')
    y = df['tax_band']

    return X, y

In [5]:
# Load the emissions table as features (X) and target (y), then show both
# shapes as the cell output.
X, y = load_data('../data/emissions.db')
X.shape, y.shape

((45511, 28), (45511,))

In [8]:
class DummyTransformer(BaseEstimator, TransformerMixin):
    """
    Adds dummies to categorical columns and removes the original ones.

    ``fit`` records which columns of the training frame are categorical
    (``object`` or ``category`` dtype) and the column layout produced by
    encoding them. ``transform`` encodes those same columns and aligns the
    result to the fitted layout: a category absent from the new frame becomes
    an all-zero column, and a category not seen during ``fit`` is dropped.
    Inputs that are not pandas DataFrames are returned unchanged.
    """

    # dtypes treated as categorical when choosing the columns to encode
    _CATEGORICAL_DTYPES = ('object', 'category')

    def _find_categorical(self, X):
        """Return the names of the columns of ``X`` with a categorical dtype."""
        selected = X.select_dtypes(include=list(self._CATEGORICAL_DTYPES))
        return selected.columns.tolist()

    def add_dummies(self, X):
        """Return ``X`` with its categorical columns replaced by dummy columns."""
        if not isinstance(X, pd.DataFrame):
            return X

        # Prefer the columns learned in fit; before fitting, detect them on X.
        columns = getattr(self, 'categorical_columns_', None)
        if columns is None:
            columns = self._find_categorical(X)
        # Ignore learned columns that this particular frame does not carry.
        columns = [column for column in columns if column in X.columns]

        if not columns:
            return X
        return pd.get_dummies(X, columns=columns)

    def fit(self, X, y=None):
        """Learn the categorical columns and the encoded column layout of ``X``."""
        # Reset first so refitting on a different input never reuses stale state.
        self.categorical_columns_ = None
        self.output_columns_ = None
        if isinstance(X, pd.DataFrame):
            self.categorical_columns_ = self._find_categorical(X)
            self.output_columns_ = self.add_dummies(X).columns.tolist()
        return self

    def transform(self, X):
        """Encode ``X`` and, when fitted on a DataFrame, align it to the fitted layout."""
        X = self.add_dummies(X)
        output_columns = getattr(self, 'output_columns_', None)
        if output_columns is not None and isinstance(X, pd.DataFrame):
            # Downstream estimators need the same columns in the same order
            # for every frame, whatever categories that frame happens to hold.
            X = X.reindex(columns=output_columns, fill_value=0)
        return X

In [9]:
# Smoke-test the transformer on the loaded features and preview the result.
dummy_transformer = DummyTransformer()
dummy_transformer.fit_transform(X).head()

In [None]:
def build_model():
    """Build the grid-searched classification pipeline.

    The pipeline passes the features through ``DummyTransformer`` and then
    fits a ``RandomForestClassifier``; ``GridSearchCV`` tunes the forest's
    split criterion.

    Returns:
        An unfitted ``GridSearchCV`` wrapping the pipeline.
    """

    pipeline = Pipeline([
        # Step name kept as-is so existing references to this step still resolve.
        ('tfidf', DummyTransformer()),
        ('clf', RandomForestClassifier(n_estimators=10)),
    ])

    # Grid keys follow '<step>__<parameter>'. The forest is the 'clf' step
    # itself rather than an estimator wrapped inside it, so its criterion is
    # addressed directly.
    parameters = {
        'clf__criterion': ['gini', 'entropy'],
    }

    model = GridSearchCV(pipeline, param_grid=parameters)

    return model

In [None]:
def evaluate_model(model, X_test, y_test):
    """Print a classification report for the test set, then the best parameters.

    Args:
        model: Fitted search object exposing ``predict`` and ``best_params_``.
        X_test: Features to predict on.
        y_test: True labels for ``X_test``.
    """

    predictions = model.predict(X_test)
    report = classification_report(y_test, predictions)

    print(report)
    print(model.best_params_)

In [None]:
def save_model(model, model_filepath):
    """Serialize ``model`` with pickle and write it to ``model_filepath``.

    Args:
        model: Object to serialize.
        model_filepath: Destination path; an existing file is overwritten.
    """

    with open(model_filepath, 'wb') as handle:
        pickler = pickle.Pickler(handle)
        pickler.dump(model)