In [1]:
import pickle
import pandas as pd
from sqlalchemy import create_engine

from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import BaseEstimator, TransformerMixin

In [62]:
def load_data(database_filepath):
    """Loads the data from a database file and returns the target."""

    engine = create_engine(f'sqlite:///{database_filepath}')
    df = pd.read_sql_table('emissions', engine)

    return df

In [57]:
X, y, X_unlabelled = load_data('../data/emissions.db')
X.shape, y.shape, X_unlabelled.shape

((7837, 28), (7837,), (37674, 28))

In [49]:
X.count()

year                      7837
manufacturer              7837
model                     7837
description               7837
euro_standard             7837
transmission              7827
transmission_type         7496
engine_capacity           7830
fuel_type                 7837
urban_metric              7824
extra_urban_metric        7824
combined_metric           7830
urban_imperial            7824
extra_urban_imperial      7824
combined_imperial         7830
noise_level               7837
co2                       7837
thc_emissions             3845
co_emissions              7830
nox_emissions             7830
thc_nox_emissions         3990
particulates_emissions    3667
fuel_cost_12000_miles     7833
fuel_cost_6000_miles         0
standard_12_months        3517
standard_6_months         2926
first_year_12_months      3517
first_year_6_months       1419
dtype: int64

In [59]:
class EmissionsTransformer(BaseEstimator, TransformerMixin):
    """
    Adds dummies to categorical columns and removes the original ones
    """
    
    def drop_columns(self, X):
        """
        Dropping model and description as they would create too many dummies.
        Dropping fuel_cost_6000_miles as it contains only empty values.
        """
        
        X = X.drop(['model', 'description', 'fuel_cost_6000_miles'], axis=1)
        return X
    
    def fill_columns(self, X):
        numeric_columns = X.select_dtypes(exclude=['object']).columns
        X[numeric_columns] = X[numeric_columns].fillna(X.mean())
        return X

    def add_dummies(self, X):
        mf = pd.get_dummies(X['manufacturer'], prefix='manufacturer')
        tm = pd.get_dummies(X['transmission'], prefix='transmission')
        tmt = pd.get_dummies(X['transmission_type'], prefix='transmission_type')
        ft = pd.get_dummies(X['fuel_type'], prefix='fuel_type')
        
        X = pd.concat([X, mf, tm, tmt, ft], axis=1)
        X = X.drop(['manufacturer', 'transmission', 'transmission_type', 'fuel_type'], axis=1)
        return X
    
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = self.drop_columns(X)
        X = self.fill_columns(X)
#         X = self.add_dummies(X)
        return X

    def preprocess(self, df):
        df = self.add_dummies(df)
        return df

In [66]:
def split_data(df):
    df_pop = df.dropna(subset=['tax_band'])
    df_nan = df[df['tax_band'].isnull()]

    X = df_pop.drop(columns='tax_band')
    y = df_pop['tax_band']
    X_nan = df_nan.drop(columns='tax_band')

    return X, y, X_nan

In [None]:
df = load_data('../data/emissions.db')
transformer = EmissionsTransformer()
df_preprocessed = transformer.preprocess(df)
X, y, X_nan = split_data(df_preprocessed)



In [7]:
transformer = EmissionsTransformer()
X_with_dummies = transformer.add_dummies(X)
X_with_dummies.head()

Unnamed: 0,year,model,description,euro_standard,engine_capacity,urban_metric,extra_urban_metric,combined_metric,urban_imperial,extra_urban_imperial,...,fuel_type_CNG,fuel_type_Diesel,fuel_type_Diesel Electric,fuel_type_Electricity,fuel_type_Electricity/Diesel,fuel_type_Electricity/Petrol,fuel_type_Petrol,fuel_type_Petrol / E85 (Flex Fuel),fuel_type_Petrol Electric,fuel_type_Petrol Hybrid
37674,2012,500,500 and 500C,5,1368.0,8.5,5.4,6.5,33.2,52.3,...,0,0,0,0,0,0,1,0,0,0
37675,2012,500,500 and 500C,5,1368.0,8.4,5.4,6.5,33.6,52.3,...,0,0,0,0,0,0,1,0,0,0
37676,2012,500,595 and 595C,5,1368.0,8.4,5.4,6.5,33.6,52.3,...,0,0,0,0,0,0,1,0,0,0
37677,2012,500,595 and 595C,5,1368.0,8.5,5.4,6.5,33.2,52.3,...,0,0,0,0,0,0,1,0,0,0
37678,2012,Punto Evo,1.4 16v Turbo MultiAir 165,5,1368.0,8.0,5.0,6.1,35.3,56.5,...,0,0,0,0,0,0,1,0,0,0


In [45]:
def build_model():
    """Describes the model used on the data, consisting of NLP transformers and
    an individual classifier of each category."""

    pipeline = Pipeline([
        ('et', EmissionsTransformer()),
        ('clf', RandomForestClassifier()),
    ])

    parameters = {
        'clf__criterion': ['gini', 'entropy'],
    }

    model = GridSearchCV(pipeline, param_grid=parameters)

    return model

In [46]:
model.get_params()

{'cv': None,
 'error_score': nan,
 'estimator__memory': None,
 'estimator__steps': [('et', EmissionsTransformer()),
  ('clf', RandomForestClassifier())],
 'estimator__verbose': False,
 'estimator__et': EmissionsTransformer(),
 'estimator__clf': RandomForestClassifier(),
 'estimator__clf__bootstrap': True,
 'estimator__clf__ccp_alpha': 0.0,
 'estimator__clf__class_weight': None,
 'estimator__clf__criterion': 'gini',
 'estimator__clf__max_depth': None,
 'estimator__clf__max_features': 'auto',
 'estimator__clf__max_leaf_nodes': None,
 'estimator__clf__max_samples': None,
 'estimator__clf__min_impurity_decrease': 0.0,
 'estimator__clf__min_impurity_split': None,
 'estimator__clf__min_samples_leaf': 1,
 'estimator__clf__min_samples_split': 2,
 'estimator__clf__min_weight_fraction_leaf': 0.0,
 'estimator__clf__n_estimators': 100,
 'estimator__clf__n_jobs': None,
 'estimator__clf__oob_score': False,
 'estimator__clf__random_state': None,
 'estimator__clf__verbose': 0,
 'estimator__clf__warm_s

In [47]:
def evaluate_model(model, X_test, y_test):
    """Shows the accuracy, precision, and recall of the model."""

    y_pred = model.predict(X_test)
    print(classification_report(y_test, y_pred))
    print(model.best_params_)

In [48]:
def save_model(model, model_filepath):
    """Saves the model as a pickle file"""

    with open(model_filepath, 'wb') as file:
        pickle.dump(model, file)

In [67]:
database_filepath = '../data/emissions.db'
model_filepath = 'classifier.pkl'

df = load_data('../data/emissions.db')
transformer = EmissionsTransformer()
df_preprocessed = transformer.preprocess(df)
X, y, X_nan = split_data(df_preprocessed)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

print('Building model...')
model = build_model()

print('Training model...')
model.fit(X_train, y_train)

print('Evaluating model...')
evaluate_model(model, X_test, y_test)

print('Saving model...\n    MODEL: {}'.format(model_filepath))
save_model(model, model_filepath)

print('Trained model saved!')

Building model...
Training model...
Evaluating model...
              precision    recall  f1-score   support

           A       0.98      1.00      0.99        61
           B       1.00      0.97      0.99        74
           C       0.97      1.00      0.98       157
           D       1.00      0.98      0.99       197
           E       1.00      1.00      1.00       222
           F       0.99      0.99      0.99       178
           G       1.00      1.00      1.00       223
           H       0.98      0.98      0.98       108
           I       0.98      0.96      0.97        67
           J       1.00      1.00      1.00       100
           K       0.97      0.99      0.98        69
           L       0.98      0.98      0.98        43
           M       1.00      1.00      1.00        69

    accuracy                           0.99      1568
   macro avg       0.99      0.99      0.99      1568
weighted avg       0.99      0.99      0.99      1568

{'clf__criterion': 'gin

In [58]:
X_nan_pop.shape

(37674, 28)

In [92]:
df_transformed = transformer.preprocess(df)
df_transformed.shape

(45511, 193)

In [93]:
df_transformed = transformer.fill_columns(df_transformed)
df_transformed.shape

(45511, 193)

In [94]:
df_transformed = df_transformed[df_transformed['tax_band'].isnull()]
df_transformed.shape

(37674, 193)

In [95]:
df_transformed = df_transformed.drop(columns='tax_band')
df_transformed.shape

(37674, 192)

In [96]:
y_predicted = model.predict(df_transformed)
len(y_predicted)

37674

In [99]:
df_test = df.copy()

In [100]:
df_test['tax_band'][:37674] = y_predicted
df_test.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['tax_band'][:37674] = y_predicted


Unnamed: 0,year,manufacturer,model,description,euro_standard,tax_band,transmission,transmission_type,engine_capacity,fuel_type,...,co_emissions,nox_emissions,thc_nox_emissions,particulates_emissions,fuel_cost_12000_miles,fuel_cost_6000_miles,standard_12_months,standard_6_months,first_year_12_months,first_year_6_months
0,2000,Alfa Romeo,145 Range,1.6 Twin Spark 16v,2,J,M5,Manual,1598.0,Petrol,...,980.0,,,,,618.0,,,,
1,2000,Alfa Romeo,145 Range,1.8 Twin Spark 16v,2,J,M5,Manual,1747.0,Petrol,...,1105.0,,,,,633.0,,,,
2,2000,Alfa Romeo,145 Range,Cloverleaf,2,K,M5,Manual,1970.0,Petrol,...,1103.0,,,,,663.0,,,,
3,2000,Alfa Romeo,146 Range,1.6 Twin Spark 16v,2,J,M5,Manual,1598.0,Petrol,...,980.0,,,,,626.0,,,,
4,2000,Alfa Romeo,146 Range,1.8 Twin Spark 16v,2,J,M5,Manual,1747.0,Petrol,...,1105.0,,,,,633.0,,,,


In [101]:
df_test['tax_band'].count()

45511

In [102]:
def save_data(df, database_filename):
    """Saves the data as a database file"""

    engine = create_engine(f'sqlite:///{database_filename}')
    df.to_sql('emissions', engine, index=False, if_exists='replace')

In [103]:
save_data(df_test, '../data_filled.db')