In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
from pandas.tools.plotting import scatter_matrix

from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import KFold

# Class, for use in pipelines, to select certain columns from a DataFrame
# and convert to a numpy array
# From A. Geron: Hands-On Machine Learning with Scikit-Learn & TensorFlow,
# O'Reilly, 2017
# Modified by Derek Bridge to allow for casting in the same ways as
# pandas.DataFrame.astype
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that picks columns out of a DataFrame and returns them as an array.

    Parameters
    ----------
    attribute_names : list
        Column labels to select from the incoming DataFrame.
    dtype : optional
        When truthy, the selected columns are cast with ``DataFrame.astype(dtype)``
        before being converted; when falsy, no cast is applied.
    """

    def __init__(self, attribute_names, dtype=None):
        self.attribute_names = attribute_names
        self.dtype = dtype

    def fit(self, X, y=None):
        """Nothing to learn; return the selector itself."""
        return self

    def transform(self, X):
        """Return ``X[attribute_names]`` as ``.values``, cast first if a dtype was given."""
        subset = X[self.attribute_names]
        if not self.dtype:
            return subset.values
        return subset.astype(self.dtype).values
    
    
# Class, for use in pipelines, to binarize nominal-valued features (while
# avoiding the dummy variable trap)
# By Derek Bridge, 2017
class FeatureBinarizer(BaseEstimator, TransformerMixin):
    """One-hot encode nominal-valued columns, dropping one indicator column per feature.

    Each input column is label-encoded with a LabelEncoder fitted on the values
    declared for it, the integer codes are one-hot encoded, and the last
    indicator column of every feature is removed so the remaining columns are
    not linearly dependent (the dummy variable trap).

    Parameters
    ----------
    features_values : list of lists
        ``features_values[i]`` lists every value that column ``i`` of the
        input array may take.
    """

    def __init__(self, features_values):
        self.features_values = features_values
        self._build_encoders()

    def _build_encoders(self):
        """(Re)create all state that is derived from ``self.features_values``."""
        sizes = [len(feature_values) for feature_values in self.features_values]
        self.num_features = len(sizes)
        self.labelencodings = [LabelEncoder().fit(feature_values)
                               for feature_values in self.features_values]
        # NOTE(review): `sparse` and `n_values` are the keyword names of the
        # older scikit-learn API this notebook was written against; confirm
        # against the installed version before upgrading.
        self.onehotencoder = OneHotEncoder(sparse=False, n_values=sizes)
        # Feature i owns a run of sizes[i] consecutive one-hot columns, so its
        # last column sits at cumsum(sizes)[i] - 1. (Summing `size - 1` instead
        # only gives the right answer for a single feature.)
        self.last_indexes = np.cumsum(sizes, dtype=int) - 1

    def _label_encode(self, X):
        """Return a new integer array holding the label codes of every column of X.

        The caller's array is left untouched, so the same X can be passed to
        fit and transform repeatedly.
        """
        X = np.asarray(X)
        if X.ndim != 2 or X.shape[1] != self.num_features:
            raise ValueError("Expected a 2D array with %d column(s), got shape %s"
                             % (self.num_features, X.shape))
        encoded = np.empty(X.shape, dtype=int)
        for i, labelencoding in enumerate(self.labelencodings):
            encoded[:, i] = labelencoding.transform(X[:, i])
        return encoded

    def fit(self, X, y=None):
        """Fit the one-hot encoder on the label-encoded X and return this transformer."""
        self.onehotencoder.fit(self._label_encode(X))
        return self

    def transform(self, X, y=None):
        """Return the one-hot encoding of X without each feature's last indicator column."""
        onehotencoded = self.onehotencoder.transform(self._label_encode(X))
        return np.delete(onehotencoded, self.last_indexes, axis=1)

    def fit_transform(self, X, y=None):
        """Fit on X, then return the transformed X."""
        return self.fit(X).transform(X)

    def get_params(self, deep=True):
        """Return the constructor parameters as a dict."""
        return {"features_values" : self.features_values}

    def set_params(self, **parameters):
        """Set the given attributes and return this transformer.

        When ``features_values`` is replaced, the encoders derived from it are
        rebuilt so they stay consistent with the new value lists.
        """
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        if "features_values" in parameters:
            self._build_encoders()
        return self
    
# Class, for use in pipelines, to impute missing values but which overcomes
# a problem that scikit-learn's class has
# when imputing modes on nominal-valued features
# From https://stackoverflow.com/questions/25239958/impute-categorical-missing-values-in-scikit-learn
# Original has a casting problem

In [4]:
# Load the MPG dataset from its CSV file into a DataFrame.
cars_df = pd.read_csv('CS4618Resources/datasets/dataset_mpg.csv')

In [5]:
# Show (number of rows, number of columns) of the freshly loaded data.
cars_df.shape

(398, 8)

In [6]:
# Summarise every column; include='all' also reports non-numeric columns
# (unique/top/freq), which is what exposes horsepower as non-numeric below.
cars_df.describe(include='all')

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,year,origin,mpg
count,398.0,398.0,398.0,398.0,398.0,398.0,398.0,398.0
unique,,,94.0,,,,,
top,,,150.0,,,,,
freq,,,22.0,,,,,
mean,5.454774,193.425879,,2970.424623,15.56809,76.01005,1.572864,23.514573
std,1.701004,104.269838,,846.841774,2.757689,3.697627,0.802055,7.815984
min,3.0,68.0,,1613.0,8.0,70.0,1.0,9.0
25%,4.0,104.25,,2223.75,13.825,73.0,1.0,17.5
50%,4.0,148.5,,2803.5,15.5,76.0,1.0,23.0
75%,8.0,262.0,,3608.0,17.175,79.0,2.0,29.0


Build a system that uses regression that predicts how many MPG our cars should get.
Use error estimation to say how good it is.


In [7]:
# Inspect the dtype of the horsepower column to see how pandas parsed it.
hp = cars_df["horsepower"]
print(hp.dtype)

object


Remove the '?' values in horsepower and then turn it into numeric values

===

See how many '?' values there are. 
I deleted all of them, but they could be replaced by the mean.

Should have made sure that the type of the column was still 'object'.

In [8]:
# Keep only the rows whose horsepower is not the "?" placeholder and renumber
# the index from 0; reset_index returns a new frame, so nothing is edited in place.
cars_df = cars_df.loc[cars_df["horsepower"] != "?"].reset_index(drop=True)
# Check the anomalies were filtered out
cars_df.shape

(392, 8)

In [9]:
# Re-read the horsepower column from the filtered frame.
hp = cars_df["horsepower"]

In [10]:
# Convert the horsepower values to a numeric dtype and write the converted
# column back into the frame.
hp = pd.to_numeric(hp)
cars_df["horsepower"] = hp

In [11]:
# Summarise again now that horsepower has been converted with pd.to_numeric.
cars_df.describe(include='all')

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,year,origin,mpg
count,392.0,392.0,392.0,392.0,392.0,392.0,392.0,392.0
mean,5.471939,194.41199,104.469388,2977.584184,15.541327,75.979592,1.576531,23.445918
std,1.705783,104.644004,38.49116,849.40256,2.758864,3.683737,0.805518,7.805007
min,3.0,68.0,46.0,1613.0,8.0,70.0,1.0,9.0
25%,4.0,105.0,75.0,2225.25,13.775,73.0,1.0,17.0
50%,4.0,151.0,93.5,2803.5,15.5,76.0,1.0,22.75
75%,8.0,275.75,126.0,3614.75,17.025,79.0,2.0,29.0
max,8.0,455.0,230.0,5140.0,24.8,82.0,3.0,46.6


In [12]:
# Columns fed to the first model; "origin" is not part of this list.
numeric_features = [
    "cylinders",
    "displacement",
    "horsepower",
    "weight",
    "acceleration",
    "year",
]

#nominal_features = ["origin"]
# Select the numeric columns, then fit an ordinary linear regression on them.
numeric_pipeline = Pipeline(steps=[
    ("selector", DataFrameSelector(numeric_features)),
    ("estimator", LinearRegression()),
])

Can use scatter matrix to create some nice pictures :)

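We don't scale the features when we do linear regression. This is because linear regression solves the least-squares problem exactly (the normal equation), so rescaling a feature only rescales its coefficient and leaves the predictions unchanged.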

We don't need to do PCA because we only have 6 features - only do PCA when we have loads of features.
In my pipeline, I should have used an estimator (linear regression) as the final step instead of a scaler.

In [13]:
#Should shuffle data here
# Shuffle with a fixed seed so that Restart & Run All reproduces the same
# row order and therefore the same cross-validation score. (The output
# recorded under this cell came from an earlier, unseeded run.)
cars_df = cars_df.take(np.random.RandomState(42).permutation(len(cars_df)))

# Target values, taken after the shuffle so they stay aligned with the rows.
y = cars_df["mpg"].values

# Mean over 10 folds of the negated mean absolute error.
np.mean(cross_val_score(numeric_pipeline, cars_df, y, scoring="neg_mean_absolute_error", cv=10))

-2.669495145511009

In this example, it was especially important to shuffle the data because the data was ordered by year. <b>Should look out for ordered data when examining it.</b>

We also included origin as numeric data.

For next week: build a new pipeline that treats origin as nominal data, and calculate the cross-validated mean absolute error for it.

In [14]:
numeric_features = ["cylinders","displacement","horsepower","weight","acceleration","year"]

nominal_features = ["origin"]

# The permitted values are declared as strings, so the selector below casts
# the origin column to str to give the binarizer labels of the same type.
nominal_features_values = [["1", "2", "3"]]
#Can use (df[feature].unique() for feature in nominal_features) instead of including this
#list as a parameter for FeatureBinarizer().

#Nominal Pipeline
nominal_pipeline = Pipeline([("selector", DataFrameSelector(nominal_features, dtype=str)),
                            ("binarizer", FeatureBinarizer(nominal_features_values))
                            ])
#Numeric Pipeline
# NOTE: this rebinds numeric_pipeline to a selector-only pipeline (no estimator),
# unlike the earlier cell that used the same name.
numeric_pipeline = Pipeline([("selector", DataFrameSelector(numeric_features))
                            ])
#Combined Pipeline
# Concatenate the numeric columns with the binarized origin columns, then
# fit a linear regression on the combined feature matrix.
pipeline = Pipeline([("union", FeatureUnion([("numeric_pipeline", numeric_pipeline),
                                            ("nominal_pipeline", nominal_pipeline)])),
                    ("estimator", LinearRegression())])


In [16]:
# Shuffle with a fixed seed so that Restart & Run All reproduces the same
# row order and therefore the same cross-validation score. (The output
# recorded under this cell came from an earlier, unseeded run.)
cars_df = cars_df.take(np.random.RandomState(42).permutation(len(cars_df)))

# Target values, taken after the shuffle so they stay aligned with the rows.
y = cars_df["mpg"].values

# Mean over 10 folds of the negated mean absolute error for the combined pipeline.
np.mean(cross_val_score(pipeline, cars_df, y, scoring="neg_mean_absolute_error", cv=10))

-2.5776063444535038