In [6]:
import pickle
import pandas as pd
from sqlalchemy import create_engine

from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import BaseEstimator, TransformerMixin

In [7]:
def load_data(database_filepath):
    """Read the ``emissions`` table from a SQLite database file.

    INPUT
    database_filepath - path to the SQLite file; it is interpolated into a
        ``sqlite:///`` SQLAlchemy URL

    OUTPUT
    DataFrame with the full contents of the ``emissions`` table
    """
    connection_url = f'sqlite:///{database_filepath}'
    emissions = pd.read_sql_table('emissions', create_engine(connection_url))
    return emissions

In [8]:
# Load the full emissions table and display its (rows, columns) shape
df = load_data('../data/emissions.db')
df.shape

(45511, 29)

In [9]:
def split_data(df, random_state=None):
    """
    Separate the labelled rows from the unlabelled ones and split the
    labelled rows into train and test sets.

    INPUT
    df - full car emissions df; must contain a 'tax_band' column
    random_state - optional seed forwarded to train_test_split. The default
        (None) keeps the split unseeded; pass an int to get the same split on
        every run.

    OUTPUT
    X_train, X_test, y_train, y_test - train and test sets built from the rows
        whose tax band is known
    df_pop - all rows with a known tax band (features and the 'tax_band' label)
    X_nan - feature matrix of the rows whose tax band is missing. These rows
        cannot be used for training or testing the model because they have no
        label.
    """

    # One mask drives both partitions, so they are guaranteed to be complementary
    has_label = df['tax_band'].notnull()

    df_pop = df[has_label]
    X = df_pop.drop(columns='tax_band')
    y = df_pop['tax_band']

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=random_state
    )

    X_nan = df[~has_label].drop(columns='tax_band')

    return X_train, X_test, y_train, y_test, df_pop, X_nan

In [10]:
# Split the data and display the shape of every returned piece
X_train, X_test, y_train, y_test, df_pop, X_nan = split_data(df)
X_train.shape, X_test.shape, y_train.shape, y_test.shape, df_pop.shape, X_nan.shape

((5877, 28), (1960, 28), (5877,), (1960,), (7837, 29), (37674, 28))

In [11]:
# Number of labelled rows (rows with a known tax band) per manufacturer
df_pop['manufacturer'].value_counts()

BMW                     890
Volkswagen              832
Mercedes-Benz           770
Vauxhall                533
Audi                    473
Ford                    441
Volvo                   432
Peugeot                 321
Kia                     313
Skoda                   306
Honda                   227
Citroen                 219
Mini                    158
Porsche                 146
Toyota                  145
Seat                    143
Renault                 139
Chevrolet               138
Hyundai                 123
Chrysler Jeep           109
Nissan                  107
Subaru                  104
Fiat                     89
Mazda                    88
Mitsubishi               85
Suzuki                   65
Land Rover               45
Aston Martin Lagonda     40
Smart                    40
Lexus                    37
Alfa Romeo               33
Jaguar Cars              32
SsangYong                30
Infiniti                 23
Bentley Motors           21
Ferrari             

In [12]:
# Flag the manufacturers that appear in at most 30 labelled rows and list them
manufacturer_counts = df_pop['manufacturer'].value_counts()
small_mf = manufacturer_counts <= 30
small_mf[small_mf].index

Index(['SsangYong', 'Infiniti', 'Bentley Motors', 'Ferrari', 'Maserati',
       'Corvette', 'Lotus', 'Morgan Motor Company', 'Rolls-Royce', 'Abarth',
       'Dacia', 'McLaren', 'Perodua', 'MG Motors UK', 'MG Motors Uk', 'LTI'],
      dtype='object')

In [13]:
# Try out the grouping on a copy of the training features: every manufacturer
# flagged in small_mf is replaced by the single label 'Other'.
X = X_train.copy()

# Resolve the set of rare manufacturers once instead of re-filtering small_mf
# for every row inside the lambda.
small_mf_names = small_mf[small_mf].index
group_small = lambda row : 'Other' if row in small_mf_names else row

X['manufacturer'] = X['manufacturer'].apply(group_small)
X['manufacturer'].unique()

array(['Other', 'Ford', 'Mitsubishi', 'Peugeot', 'Audi', 'BMW',
       'Mercedes-Benz', 'Volkswagen', 'Seat', 'Chevrolet', 'Smart',
       'Honda', 'Vauxhall', 'Alfa Romeo', 'Hyundai', 'Kia', 'Nissan',
       'Volvo', 'Mazda', 'Chrysler Jeep', 'Renault', 'Skoda', 'Lexus',
       'Suzuki', 'Citroen', 'Fiat', 'Subaru', 'Porsche', 'Toyota', 'Mini',
       'Land Rover', 'Aston Martin Lagonda', 'Jaguar Cars'], dtype=object)

In [14]:
# Flag the transmission codes that appear in at most 30 labelled rows and list them
transmission_counts = df_pop['transmission'].value_counts()
small_tm = transmission_counts <= 30
small_tm[small_tm].index

Index(['M7', 'QA6', '5AT', 'SAT5', '4AT', 'AMT5', 'A6-AWD', 'A6x2', 'ET5',
       'ASM', 'DCT7', 'M6-AWD', 'SAT6', 'M6x2', '7SP. SSG', 'MultiDriv',
       'MultiDrive', 'A8-AWD', '5MTx2', 'Multi5', 'M5x2', 'A5-AWD', 'MTA5',
       'Multi6', 'S/A6', 'M8'],
      dtype='object')

In [15]:
# Number of labelled rows per transmission type (value_counts skips missing values)
df_pop['transmission_type'].value_counts()

Manual       4275
Automatic    3221
Name: transmission_type, dtype: int64

In [16]:
# Number of labelled rows per fuel type
df_pop['fuel_type'].value_counts()

Diesel                      3960
Petrol                      3747
Petrol Hybrid                 62
Diesel Electric               22
Petrol / E85 (Flex Fuel)      16
Petrol Electric               13
Electricity                    7
Electricity/Petrol             5
CNG                            4
Electricity/Diesel             1
Name: fuel_type, dtype: int64

In [17]:
# Flag the fuel types that appear in at most 30 labelled rows and list them
fuel_type_counts = df_pop['fuel_type'].value_counts()
small_ft = fuel_type_counts <= 30
small_ft[small_ft].index

Index(['Diesel Electric', 'Petrol / E85 (Flex Fuel)', 'Petrol Electric',
       'Electricity', 'Electricity/Petrol', 'CNG', 'Electricity/Diesel'],
      dtype='object')

In [36]:
class EmissionsTransformer(BaseEstimator, TransformerMixin):
    """
    Prepares the raw emissions features for a classifier.

    The transformation drops unused columns, fills missing numeric values with
    column means, groups rare categories into 'Other' and replaces the
    categorical columns with dummy (one-hot) columns.

    ``fit`` learns two things from the training data so that every later
    ``transform`` call produces the same feature matrix layout:

    means_   - per-column means used to fill missing numeric values
    columns_ - the ordered list of output columns

    Without that state, dummy columns would be derived independently from each
    frame, so a category missing from a test set or CV fold would change the
    number/order of columns handed to the classifier.

    An instance that has not been fitted still works: ``transform`` then falls
    back to computing means and dummy columns from the frame it is given.
    """

    # Numeric columns that are not used as features
    IRRELEVANT_NUMERIC = [
        'urban_metric', 'extra_urban_metric', 'urban_imperial',
        'extra_urban_imperial', 'combined_imperial', 'thc_nox_emissions',
        'fuel_cost_6000_miles', 'standard_12_months', 'standard_6_months',
        'first_year_12_months', 'first_year_6_months',
    ]

    # Categorical columns that are not used as features
    IRRELEVANT_CATEGORICAL = ['model', 'description']

    # Numeric feature columns whose missing values are filled with the mean
    RELEVANT_NUMERIC = [
        'year', 'euro_standard', 'noise_level', 'engine_capacity',
        'combined_metric', 'fuel_cost_12000_miles', 'co2', 'thc_emissions',
        'co_emissions', 'nox_emissions', 'particulates_emissions',
    ]

    # Categorical feature columns that are replaced by dummy columns
    CATEGORICAL = ['manufacturer', 'transmission', 'transmission_type', 'fuel_type']

    # Rare values per categorical column that are collapsed into 'Other'
    RARE_CATEGORIES = {
        'manufacturer': [
            'SsangYong', 'Infiniti', 'Bentley Motors', 'Ferrari', 'Maserati',
            'Lotus', 'Corvette', 'Rolls-Royce', 'Morgan Motor Company', 'Abarth',
            'Dacia', 'McLaren', 'Perodua', 'MG Motors UK', 'LTI', 'MG Motors Uk',
        ],
        'transmission': [
            'QA6', 'M7', '5AT', 'SAT5', '4AT', 'AMT5', 'A6-AWD', 'A6x2', 'ASM',
            'DCT7', 'ET5', 'M6-AWD', 'SAT6', 'M6x2', '7SP. SSG', 'MultiDriv',
            'MultiDrive', 'A8-AWD', 'Multi5', '5MTx2', 'M5x2', 'A5-AWD', 'Multi6',
            'S/A6', 'MTA5', 'M8',
        ],
        'fuel_type': [
            'Diesel Electric', 'Petrol / E85 (Flex Fuel)', 'Petrol Electric',
            'Electricity', 'Electricity/Petrol', 'CNG', 'Electricity/Diesel',
        ],
    }

    def drop_columns(self, X):
        """Dropping irrelevant columns from the data set.

        Returns a new frame; raises KeyError if an expected column is absent.
        """

        X = X.drop(self.IRRELEVANT_NUMERIC, axis=1)
        X = X.drop(self.IRRELEVANT_CATEGORICAL, axis=1)

        return X

    def fill_columns(self, X):
        """Filling the numeric columns with the mean of these columns.

        Uses the means learned in ``fit`` when available, otherwise the means
        of ``X`` itself. Returns a filled copy; ``X`` is left untouched.
        """

        means = getattr(self, 'means_', None)
        if means is None:
            # Restrict the mean to the numeric feature columns: calling mean()
            # on a frame that still holds string columns fails on recent pandas.
            means = X[self.RELEVANT_NUMERIC].mean(numeric_only=True)

        X = X.copy()
        X[self.RELEVANT_NUMERIC] = X[self.RELEVANT_NUMERIC].fillna(means)

        return X

    def adjust_categorical(self, X):
        """
        Groups the rare values of 'manufacturer', 'transmission' and
        'fuel_type' into a single 'Other' value, so that they do not each get
        their own dummy column. Missing values are left as they are.

        Returns an adjusted copy; ``X`` is left untouched.
        """

        X = X.copy()
        for column, rare_values in self.RARE_CATEGORIES.items():
            X[column] = X[column].mask(X[column].isin(rare_values), 'Other')

        return X

    def add_dummies(self, X):
        """Replaces the categorical columns with dummy columns.

        The dummy columns are appended after the remaining columns, in the
        order of ``CATEGORICAL``. Once the transformer is fitted, the result is
        aligned to the columns seen during ``fit``: dummy columns missing from
        ``X`` are added filled with 0 and columns unseen during ``fit`` are
        dropped.
        """

        dummies = [
            pd.get_dummies(X[column], prefix=column) for column in self.CATEGORICAL
        ]
        X = pd.concat([X.drop(self.CATEGORICAL, axis=1)] + dummies, axis=1)

        columns = getattr(self, 'columns_', None)
        if columns is not None:
            X = X.reindex(columns=columns, fill_value=0)

        return X

    def fit(self, X, y=None):
        """Learns the imputation means and the output column layout from X."""

        # Forget any previous fit so the helpers below run without alignment
        self.means_ = None
        self.columns_ = None

        prepared = self.drop_columns(X)
        self.means_ = prepared[self.RELEVANT_NUMERIC].mean(numeric_only=True)

        prepared = self.fill_columns(prepared)
        prepared = self.adjust_categorical(prepared)
        prepared = self.add_dummies(prepared)
        self.columns_ = list(prepared.columns)

        return self

    def transform(self, X):
        """Applies all preparation steps and returns the feature DataFrame."""

        X = self.drop_columns(X)
        X = self.fill_columns(X)
        X = self.adjust_categorical(X)
        X = self.add_dummies(X)
        return X


In [37]:
# Stand-alone transformer instance; build_model() creates its own instance for the pipeline
transformer = EmissionsTransformer()

In [38]:
def build_model():
    """Assemble the grid-searched classification pipeline.

    The pipeline chains an EmissionsTransformer (feature preparation) with a
    RandomForestClassifier. The grid search tries both split criteria of the
    forest, 'gini' and 'entropy'.

    OUTPUT
    unfitted GridSearchCV object wrapping the pipeline
    """

    steps = [
        ('et', EmissionsTransformer()),
        ('clf', RandomForestClassifier()),
    ]
    search_space = {'clf__criterion': ['gini', 'entropy']}

    return GridSearchCV(Pipeline(steps), param_grid=search_space)

In [39]:
# Build the (still unfitted) grid search over the pipeline
model = build_model()

In [40]:
# List every parameter of the search and its pipeline; the keys show the names usable in a param grid
model.get_params()

{'cv': None,
 'error_score': nan,
 'estimator__memory': None,
 'estimator__steps': [('et', EmissionsTransformer()),
  ('clf', RandomForestClassifier())],
 'estimator__verbose': False,
 'estimator__et': EmissionsTransformer(),
 'estimator__clf': RandomForestClassifier(),
 'estimator__clf__bootstrap': True,
 'estimator__clf__ccp_alpha': 0.0,
 'estimator__clf__class_weight': None,
 'estimator__clf__criterion': 'gini',
 'estimator__clf__max_depth': None,
 'estimator__clf__max_features': 'auto',
 'estimator__clf__max_leaf_nodes': None,
 'estimator__clf__max_samples': None,
 'estimator__clf__min_impurity_decrease': 0.0,
 'estimator__clf__min_impurity_split': None,
 'estimator__clf__min_samples_leaf': 1,
 'estimator__clf__min_samples_split': 2,
 'estimator__clf__min_weight_fraction_leaf': 0.0,
 'estimator__clf__n_estimators': 100,
 'estimator__clf__n_jobs': None,
 'estimator__clf__oob_score': False,
 'estimator__clf__random_state': None,
 'estimator__clf__verbose': 0,
 'estimator__clf__warm_s

In [41]:
def evaluate_model(model, X_test, y_test):
    """Prints a classification report for the model on the test data.

    INPUT
    model - fitted estimator with a ``predict`` method
    X_test - test features passed to ``model.predict``
    y_test - true labels for X_test

    OUTPUT
    None. The classification report is printed, followed by the best
    parameters when the model exposes ``best_params_``.
    """

    y_pred = model.predict(X_test)
    print(classification_report(y_test, y_pred))

    # Only hyper-parameter searches expose best_params_; a plain fitted
    # estimator or pipeline would otherwise raise AttributeError here.
    if hasattr(model, 'best_params_'):
        print(model.best_params_)

In [42]:
def save_model(model, model_filepath):
    """Serialise the model to disk with pickle.

    INPUT
    model - object to store
    model_filepath - destination path; an existing file is overwritten
    """

    with open(model_filepath, mode='wb') as pickle_file:
        pickle.dump(model, pickle_file)

In [43]:
# End-to-end run: load -> split -> build -> train -> evaluate -> save.
# Input database and output pickle locations (relative to the notebook).
database_filepath = '../data/emissions.db'
model_filepath = 'classifier.pkl'

print('Loading data...')
df = load_data(database_filepath)

# This replaces the split variables created in the earlier exploration cell.
# split_data is called without a seed here, so the split can differ between runs.
print('Splitting data...')
X_train, X_test, y_train, y_test, df_pop, X_nan = split_data(df)

print('Building model...')
model = build_model()

# Runs the grid search defined in build_model() on the training data
print('Training model...')
model.fit(X_train, y_train)

# Prints the classification report and the best parameters for the test data
print('Evaluating model...')
evaluate_model(model, X_test, y_test)

# Pickles the whole fitted search object (not only the best estimator)
print('Saving model...\n    MODEL: {}'.format(model_filepath))
save_model(model, model_filepath)

print('Trained model saved!')

Loading data...
Splitting data...
Building model...
Training model...
Evaluating model...
              precision    recall  f1-score   support

           A       1.00      1.00      1.00        87
           B       0.97      1.00      0.98       111
           C       1.00      0.98      0.99       225
           D       1.00      1.00      1.00       227
           E       1.00      1.00      1.00       230
           F       1.00      1.00      1.00       227
           G       0.99      1.00      0.99       288
           H       0.98      0.96      0.97       113
           I       0.95      0.95      0.95        82
           J       0.98      1.00      0.99       113
           K       1.00      1.00      1.00       106
           L       1.00      1.00      1.00        52
           M       1.00      1.00      1.00        99

    accuracy                           0.99      1960
   macro avg       0.99      0.99      0.99      1960
weighted avg       0.99      0.99      0.99 