# Automotive Data 

# importing packages

In [39]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.feature_extraction import DictVectorizer
from sklearn.svm import SVR
from sklearn import svm, metrics, linear_model
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier,AdaBoostRegressor,GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
import pandas as pd
import io
import requests

# importing data from link

In [40]:
url="https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data"
s=requests.get(url).content
adata=pd.read_csv(io.StringIO(s.decode('utf-8')),
              names = ["symboling", "normalized-losses", "make", "fuel-type"
                           ,"aspiration", "num-of-doors", "body-style", "drive-wheels"
                           ,"engine-location", "wheel-base", "length", "width"
                           ,"height", "curb-weight", "engine-type", "num-of-cylinders"
                           ,"engine-size", "fuel-system", "bore", "stroke"
                           ,"compression-ratio", "horsepower", "peak-rpm", "city-mpg"
                           ,"highway-mpg", "price"
                          ]
             )

In [41]:
adata.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


# Impute malformed or non-numerical data - replaced with NAN

As part of data-pre-processing, I decided to include entries with missing values since there's not too much data (205 rows) and I decided to replace the missing values with the mean value of the column (Mean substitution) so as not to change the mean value of the variable and still be able to use the other features in that row in the classification.

In [42]:
adata['normalized-losses-imputed']=adata[['normalized-losses']].apply(pd.to_numeric,  errors='coerce')
adata['bore-imputed']=adata[['bore']].apply(pd.to_numeric,  errors='coerce')
adata['stroke-imputed']=adata[['stroke']].apply(pd.to_numeric,  errors='coerce')
adata['horsepower-imputed']=adata[['horsepower']].apply(pd.to_numeric,  errors='coerce')
adata['peak-rpm-imputed']=adata[['peak-rpm']].apply(pd.to_numeric,  errors='coerce')
adata['price-imputed']=adata[['price']].apply(pd.to_numeric,  errors='coerce')


In [43]:
adata.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,peak-rpm,city-mpg,highway-mpg,price,normalized-losses-imputed,bore-imputed,stroke-imputed,horsepower-imputed,peak-rpm-imputed,price-imputed
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,5000,21,27,13495,,3.47,2.68,111.0,5000.0,13495.0
1,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,5000,21,27,16500,,3.47,2.68,111.0,5000.0,16500.0
2,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,5000,19,26,16500,,2.68,3.47,154.0,5000.0,16500.0
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,5500,24,30,13950,164.0,3.19,3.4,102.0,5500.0,13950.0
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,5500,18,22,17450,164.0,3.19,3.4,115.0,5500.0,17450.0


# Impute Data - Replace NAN with mean value of that feature/column

In [44]:
adata["normalized-losses-imputed"].fillna(adata['normalized-losses-imputed'].mean(),inplace=True)
adata["bore-imputed"].fillna(adata['bore-imputed'].mean(),inplace=True)
adata["stroke-imputed"].fillna(adata['stroke-imputed'].mean(),inplace=True)
adata["horsepower-imputed"].fillna(adata['horsepower-imputed'].mean(),inplace=True)
adata["peak-rpm-imputed"].fillna(adata['peak-rpm-imputed'].mean(),inplace=True)
adata["price-imputed"].fillna(adata['price-imputed'].mean(),inplace=True)

In [45]:
adata.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,peak-rpm,city-mpg,highway-mpg,price,normalized-losses-imputed,bore-imputed,stroke-imputed,horsepower-imputed,peak-rpm-imputed,price-imputed
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,5000,21,27,13495,122.0,3.47,2.68,111.0,5000.0,13495.0
1,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,5000,21,27,16500,122.0,3.47,2.68,111.0,5000.0,16500.0
2,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,5000,19,26,16500,122.0,2.68,3.47,154.0,5000.0,16500.0
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,5500,24,30,13950,164.0,3.19,3.4,102.0,5500.0,13950.0
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,5500,18,22,17450,164.0,3.19,3.4,115.0,5500.0,17450.0


Separate features with continuous numerical data

In [46]:
vec_adata_cont = adata[[ 'symboling', 'normalized-losses', 'wheel-base', 'length' 
                          ,'width', 'height', 'curb-weight', 'engine-size' 
                          , 'compression-ratio', 'city-mpg', 'highway-mpg'
                          ,'normalized-losses-imputed','bore-imputed' 
                          ,'stroke-imputed','horsepower-imputed' 
                          ,'peak-rpm-imputed','price-imputed' 
                         ]]

As part of preprocessing and feature extraction, we separate the categorical features and transform them to dummy features/variables, creating multiple variables that correspond to the categories in the original feature. This is important in transforming the categorical values to numerical value, and especially when the categories are independent and have no numerical relation.

In [47]:
pd_adata_cat = pd.get_dummies(adata.drop( [ 'symboling', 'normalized-losses', 'wheel-base', 'length' 
                          ,'width', 'height', 'curb-weight', 'engine-size' 
                          ,'bore', 'stroke', 'compression-ratio', 'horsepower' 
                          ,'peak-rpm', 'city-mpg', 'highway-mpg', 'price'
                          ,'normalized-losses-imputed','bore-imputed' 
                          ,'stroke-imputed','horsepower-imputed' 
                          ,'peak-rpm-imputed','price-imputed' 
                         ], axis = 1 ))
pd_adata_all = pd.concat([vec_adata_cont, pd_adata_cat], axis=1, join_axes=[vec_adata_cont.index])

  if __name__ == '__main__':


In [48]:
pd_adata_all.head()

Unnamed: 0,symboling,normalized-losses,wheel-base,length,width,height,curb-weight,engine-size,compression-ratio,city-mpg,...,num-of-cylinders_twelve,num-of-cylinders_two,fuel-system_1bbl,fuel-system_2bbl,fuel-system_4bbl,fuel-system_idi,fuel-system_mfi,fuel-system_mpfi,fuel-system_spdi,fuel-system_spfi
0,3,?,88.6,168.8,64.1,48.8,2548,130,9.0,21,...,0,0,0,0,0,0,0,1,0,0
1,3,?,88.6,168.8,64.1,48.8,2548,130,9.0,21,...,0,0,0,0,0,0,0,1,0,0
2,1,?,94.5,171.2,65.5,52.4,2823,152,9.0,19,...,0,0,0,0,0,0,0,1,0,0
3,2,164,99.8,176.6,66.2,54.3,2337,109,10.0,24,...,0,0,0,0,0,0,0,1,0,0
4,2,164,99.4,176.6,66.4,54.3,2824,136,8.0,18,...,0,0,0,0,0,0,0,1,0,0


Split train and test data with combined continuous and transformed categorical features (dummy variables)

In [49]:
train = pd_adata_all.sample(frac=0.8, random_state=1)
test = pd_adata_all.loc[~pd_adata_all.index.isin(train.index)]
label_train = train['symboling']
feat_train = train.drop(['symboling','normalized-losses'
                               ],axis=1)
label_test = test['symboling']
feat_test = test.drop(['symboling','normalized-losses'
                       ],axis=1)

# SVM

Since risk categories may be considered as discrete values or categories,
Let's train the risk predictor model using svm with the risk scores as classes/labels

In [50]:
classifier = svm.SVC(gamma='auto',kernel='poly',degree=1)
classifier.fit(feat_train, label_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=1, gamma='auto', kernel='poly',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

Predict the test set and evaluate using precision, recall, f1-measure. Shown here also is the confusion matrix, a good way of inspecting where the models got it wrong.

In [52]:
classifier = svm.SVC(gamma='auto',kernel='poly',degree=1,random_state=50)
classifier.fit(feat_train, label_train)
expected = label_test
predicted = classifier.predict(feat_test)
print("Classification report for classifier %s:\n%s\n")

Classification report for classifier %s:
%s



In [53]:
classifier = svm.SVC(gamma='auto',kernel='poly',degree=1,random_state=50)
classifier.fit(feat_train, label_train)
expected = label_test
predicted = classifier.predict(feat_test)
print("Classification report for classifier %s:\n%s\n"
      % (classifier, metrics.classification_report(expected, predicted)))
print("Confusion matrix:\n%s" % metrics.confusion_matrix(expected, predicted))

Classification report for classifier SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=1, gamma='auto', kernel='poly',
    max_iter=-1, probability=False, random_state=50, shrinking=True, tol=0.001,
    verbose=False):
              precision    recall  f1-score   support

          -2       0.00      0.00      0.00         0
          -1       0.80      0.80      0.80         5
           0       0.53      0.69      0.60        13
           1       0.55      0.55      0.55        11
           2       0.50      0.17      0.25         6
           3       0.80      0.67      0.73         6

    accuracy                           0.59        41
   macro avg       0.53      0.48      0.49        41
weighted avg       0.60      0.59      0.58        41


Confusion matrix:
[[0 0 0 0 0 0]
 [0 4 1 0 0 0]
 [0 0 9 3 1 0]
 [1 0 3 6 0 1]
 [0 1 2 2 1 0]
 [0 0 2 0 0 4]]


  'recall', 'true', average, warn_for)


# Random Forest Classifier (Ensemble classifier)

Try a different classifier algorithm - this time, an ensemble classifier (Random forest)
An ensemble classifier utilizes multiple weak classifiers to improve the classification. Ensemble classifiers have been shown to improve accuracy when compared to single classifier models.

In [54]:
classifier = RandomForestClassifier(n_estimators=10,random_state=50)
classifier.fit(feat_train, label_train)
expected = label_test
predicted = classifier.predict(feat_test)
print("Classification report for classifier %s:\n%s\n"
      % (classifier, metrics.classification_report(expected, predicted)))
print("Confusion matrix:\n%s" % metrics.confusion_matrix(expected, predicted))

Classification report for classifier RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=50, verbose=0,
                       warm_start=False):
              precision    recall  f1-score   support

          -1       0.80      0.80      0.80         5
           0       0.75      0.92      0.83        13
           1       0.80      0.73      0.76        11
           2       0.67      0.33      0.44         6
           3       0.71      0.83      0.77         6

    accuracy                           0.76        41
   macro avg       0.75      0.72      0.72        41
weighted avg       0.75      0.76      0

# Gradient Boosting Regression + Feature Normalization

The features are expressed using different units and have different range. This time normalize the different features to values within -1 and 1 based on minimum and maximum values. 

In [55]:
scaler = sklearn.preprocessing.MinMaxScaler(feature_range=(-1, 1))
temp=pd_adata_all.drop(['symboling','normalized-losses'
                       ],axis=1)
pd_adata_all_scaled = pd.DataFrame(scaler.fit_transform(temp), columns=temp.columns)
pd_adata_all_scaled2 = pd.concat([pd_adata_all_scaled, pd_adata_all['symboling']], axis=1, join_axes=[pd_adata_all_scaled.index])

  """


Train a regression algorithm (Gradient Boosting Regressor).


In [56]:
train = pd_adata_all_scaled2.sample(frac=0.8, random_state=1)
test = pd_adata_all_scaled2.loc[~pd_adata_all_scaled2.index.isin(train.index)]
label_train = train['symboling']
feat_train = train.drop(['symboling'
                               ],axis=1)
label_test = test['symboling']
feat_test = test.drop(['symboling'
                       ],axis=1)
classifier=GradientBoostingRegressor(n_estimators=500, learning_rate=0.1,
                                    max_depth=1, random_state=0, loss='ls')
classifier.fit(feat_train, label_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
                          learning_rate=0.1, loss='ls', max_depth=1,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=500,
                          n_iter_no_change=None, presort='auto', random_state=0,
                          subsample=1.0, tol=0.0001, validation_fraction=0.1,
                          verbose=0, warm_start=False)

# Evaluation (Regression)

Evaluate based on mean squared error. For a regression task, this is a better evaluation metric since it captures how far the predicted values are from the target value.

It's more telling because it gives you an idea of how 'far' your prediction was from the actual data. For risk scoring, not all mis-predictions are equal, it matters how far the guess is from the target.


In [57]:
expected = label_test
predicted = classifier.predict(feat_test)
print ('Mean squared error is: ' , mean_squared_error(label_test,predicted))    
print (predicted)

Mean squared error is:  0.2977217818799533
[ 2.99978153  0.34993047  0.48457079  0.06140872  1.12713294  0.95153598
  2.0157146   0.92242342  1.26494417  3.1101926   0.08797684  0.97669435
  0.86155473 -0.64491901 -0.42105001  1.42110943  1.93925121  0.59015165
  0.61268402 -0.12216654  0.07666441  1.03386739  2.72840671  1.23989317
  1.15898231  1.43323676  2.68478851  1.49701726  0.87793946  1.31869491
  0.29110968  0.29110968  0.15333731  0.17813488  0.02512053  0.1222496
  1.88509487 -0.83624766  0.86917628 -1.29551473 -1.2177775 ]
