In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Load Data

In [56]:
df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/car/car.data', names=['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'accep'])
print(df)

     buying  maint  doors persons lug_boot safety  accep
0     vhigh  vhigh      2       2    small    low  unacc
1     vhigh  vhigh      2       2    small    med  unacc
2     vhigh  vhigh      2       2    small   high  unacc
3     vhigh  vhigh      2       2      med    low  unacc
4     vhigh  vhigh      2       2      med    med  unacc
...     ...    ...    ...     ...      ...    ...    ...
1723    low    low  5more    more      med    med   good
1724    low    low  5more    more      med   high  vgood
1725    low    low  5more    more      big    low  unacc
1726    low    low  5more    more      big    med   good
1727    low    low  5more    more      big   high  vgood

[1728 rows x 7 columns]


# PreProcess Data

In [58]:
df['accep'] = ~(df['accep']=='unacc') #1 is acceptable, 0 if not acceptable
X = pd.get_dummies(df.iloc[:,0:6], drop_first=True)
y = df['accep']

# Split Data

In [4]:
x_train, x_test, y_train, y_test = train_test_split(X,y, random_state=0, test_size=0.25)

# Build Model

In [5]:
dt = DecisionTreeClassifier()
dt.fit(x_train, y_train)

# Score

In [6]:
print("Accuracy score of DT on test set (trained using full feature set):")
accuracy_dt = dt.score(x_test, y_test)
print(accuracy_dt)

Accuracy score of DT on test set (trained using full feature set):
0.9467592592592593


The accuracy of the model using all the features avalible is 94% which is a good accuracy score. 

# Test Model using Random Features

In [35]:
# 1. Create rand_features, random samples from the set of features
rand_features = np.random.choice(x_train.columns,10)

# Make new decision tree trained on random sample of 10 features and calculate the new accuracy score
dt2 = DecisionTreeClassifier()
dt2.fit(x_train[rand_features], y_train)
accuracy_dt2 = dt2.score(x_test[rand_features], y_test)
print("Accuracy score of DT on test set (trained using random feature sample):")
print(accuracy_dt2)

Accuracy score of DT on test set (trained using random feature sample):
0.8680555555555556


This takes only 10 columns from training set to see if any of the individual variables in a random order could create a better score than the full training set.

# Aggregated Random Features on 10 Samples


In [55]:
# 2. Build decision trees on 10 different random samples 
predictions = []
for i in range(10):
    rand_features = np.random.choice(x_train.columns,10)
    dt2.fit(x_train[rand_features], y_train)
    predictions.append(dt2.predict(x_test[rand_features]))

## 3. Get aggregate predictions and accuracy score
prob_predictions = np.array(predictions).mean(0)
agg_predictions = (prob_predictions > 0.5)
agg_accuracy = accuracy_score(y_test, agg_predictions)
print('Accuracy score of aggregated 10 samples:')
print(agg_accuracy)
print(len(agg_predictions))

Accuracy score of aggregated 10 samples:
0.7314814814814815
432


The aggregated score of the 10 samples ran from the random features in the cell above still under performs compared to the full training set. It looks like we need all of the features to get a good score in this dataset. 