In [41]:
import pandas as pd

In [73]:
data = pd.read_csv('machine.data',names=['vendor','model','MYCT','MMIN','MMAX','CACH','CHMIN','CHMAX','PRP','ERP'])
label = data['ERP']
data = data.drop(columns=['ERP'])

In [74]:
# data = pd.read_csv('forestfires.csv')
# label = data['area']
# data = data.drop(columns=['area'])

In [75]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
# data = data[['MMIN','MMAX','CACH','CHMIN','CHMAX','PRP']] ## Only numeric attributes
X_train, X_test, y_train, y_test = train_test_split(data, label, test_size=0.33, random_state=0)

In [76]:
all_cols = pd.get_dummies(data).columns
def add_missing_dummy_columns( d, columns ):
    missing_cols = set( columns ) - set( d.columns )
    for c in missing_cols:
        d[c] = 0

def fix_columns( d, columns ):  
    add_missing_dummy_columns( d, columns )

    # make sure we have all the columns we need
    assert( set( columns ) - set( d.columns ) == set())

    d = d[ columns ]
    return d

In [77]:
def create_node(samples, labels, depth, model=None, split_by_feature=None, threshold=None, loss=None, 
                childrens=[], nominal_value=None):
    if model==None:
        loss, model = linear_fit(samples,labels)
    return {
        'samples': samples,
        'labels': labels,
        'childrens': childrens,
        'model': model, 
        'loss': loss,
        'split_by_feature': split_by_feature,
        'threshold': threshold,
        'nominal_value':nominal_value,
        'depth':depth
    }

In [78]:
from sklearn.model_selection import train_test_split

def linear_fit(train, label):
    reg = Ridge().fit(pd.get_dummies(train), label)
    mse_score = mean_squared_error(reg.predict(pd.get_dummies(train)), label)
    wmse_score = mse_score*len(train)
    return wmse_score, reg

In [79]:
import sklearn
try:

    getattr(sklearn.linear_model, "Ridge")
except Exception:
    print("DA")

In [80]:
min_samples_leaf=2
def numeric_split(feature, train, label, threshold, depth):
    mask = train[feature] >= threshold
    right_node_samples, right_node_labels = train[mask], label[mask]
    left_node_samples, left_node_labels = train[~mask], label[~mask]
    if len(left_node_samples)<min_samples_leaf or len(right_node_samples)<min_samples_leaf:
        return {'childrens':[],'loss':None}
    right_node_wmse, right_node_model = linear_fit(right_node_samples, right_node_labels)
    left_node_wmse, left_node_model = linear_fit(left_node_samples, left_node_labels)
    wmse = (right_node_wmse + left_node_wmse)/len(train)
    mse, model = linear_fit(train,label)
    right_node = create_node(right_node_samples, right_node_labels, depth, right_node_model, loss=right_node_wmse)
    left_node = create_node(left_node_samples, left_node_labels, depth, left_node_model, loss=left_node_wmse)
    
    return create_node(train, label, depth, model ,loss=wmse, childrens=[right_node, left_node], split_by_feature=feature, threshold=threshold)


In [81]:
def node_from_nominal_group(df, depth, nominal_value):
    samples = df.drop('Y',axis=1)
    labels = df['Y']
    node = create_node(samples, labels, depth, nominal_value=nominal_value)
    print(node['nominal_value'])
    return node

def nominal_split(feature, train, label, depth):
    df = train.copy()
    df['Y'] = label
    filtered_nominal_groups = df.groupby(feature).filter(lambda x: len(x)>=min_samples_leaf).groupby(feature)
    if len(filtered_nominal_groups)<2:
        return None
    
    nodes = []
    wmse = 0
    for nominal_value, nominal_group in filtered_nominal_groups:
        node = node_from_nominal_group(nominal_group, depth, nominal_value)
        nodes.append(node)
        wmse += node['loss']
    
    mse, model = linear_fit(train, label) # We don't use the MSE of the linear_fit, but we train a model in order to use it 
                                          # at prediction, when the nominal feature value won't be one of the splitted values
    print(wmse, len(train), len(filtered_nominal_groups))
    wmse = wmse/len(train) 
    return create_node(train, label, depth, model ,loss=wmse, split_by_feature=feature, childrens=nodes)

In [82]:
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge
from operator import itemgetter

def split_by_mse(feature, node):
    if node['samples'][feature].dtype=='O':
        split = nominal_split(feature, node['samples'], node['labels'], node['depth']+1)
        if split==None or len(split['childrens'])<2:
            return None
    else:       
        attr_splits = list(set(node['samples'][feature].values))
        splits = []
        for split in attr_splits:
            split_childrens = numeric_split(feature, node['samples'], node['labels'], split, node['depth']+1)
            if len(split_childrens['childrens']) > 0 and all(children['loss'] is not None for children in split_childrens['childrens']):
                splits.append(split_childrens)
        if len(splits)==0:
            return None
        split = min(splits,key=itemgetter('loss'))
        # We filter None due to splits with low number of samples in one of the sides
        if node['nominal_value']!=None:
            split['nominal_value'] = node['nominal_value'] #Important to keep the nominal_value in case we splitted a categorical node
    return split

In [83]:
def split_node(node):
    features = node['samples'].columns
    splits = []
    for feature in features:
        split = split_by_mse(feature, node)
        if split!=None:
            splits.append(split)
    if len(splits)==0:
        return node
    curr_split = min(splits, key=itemgetter('loss'))
    return curr_split

In [84]:
tree_root = create_node(X_train, y_train,depth=0)

def expand_tree(node):
    node = split_node(node)
    if len(node['childrens'])==0:
        return node
    print("Splitted node at depth {d} by feature {f} and threshold {t}".format(d=node['depth'],f=node['split_by_feature'],t=node['threshold']))
    for index, children in enumerate(node['childrens']):
        print("Node {i} with {n} samples".format(i=index,n=len(children['samples'])))
        node['childrens'][index] = expand_tree(children)
        
    return node

In [None]:
tree = expand_tree(tree_root)

amdahl
apollo
burroughs
c.r.d
cambex
cdc
dec
dg
formation
gould
harris
honeywell
hp
ibm
ipl
magnuson
nas
ncr
nixdorf
perkin-elmer
prime
siemens
sperry
wang
538.8099861488878 140 24


In [59]:
def predict(tree, sample):
    if(len(tree['childrens'])==0): #Leaf
        data = fix_columns(pd.get_dummies(sample),pd.get_dummies(tree['samples']).columns)
        print(data.shape)
        return tree['model'].predict(data)
    if tree['threshold']==None: #Categorical
        found = False
        for index, child in enumerate(tree['childrens']):
            if sample[tree['split_by_feature']].values[0] == child['nominal_value']:
                print("Found")
                node = child
                found = True
                break
        if not found:
            print("Not found")
            return tree['model'].predict(fix_columns(pd.get_dummies(sample),pd.get_dummies(tree['samples']).columns))
    else: #Numeric
        if(sample[tree['split_by_feature']].values[0]>=tree['threshold']):
            node = tree['childrens'][0] #right node
        else:
            node = tree['childrens'][1]
    return predict(node, sample)

In [62]:
predict(tree, X_test.iloc[0:1])

Found
(1, 11)


array([35.99309084])

In [60]:
preds = [predict(tree,row.to_frame().T) for index,row in X_test.iterrows()]
mean_squared_error(preds,y_test)

Found
(1, 11)
Found
(1, 10)
Not found
Found
(1, 11)
Found
(1, 10)
Found
(1, 11)
Found
(1, 10)
Found
(1, 11)
Found
(1, 11)
Found
(1, 11)
Found
(1, 11)
Found
(1, 10)
Found
(1, 11)
Found
(1, 10)
Found
(1, 11)
Found
(1, 10)
Found
(1, 10)
Found
(1, 10)
Found
(1, 11)
Found
(1, 11)
Found
(1, 10)
Found
(1, 11)
Found
(1, 11)
Found
(1, 10)
Found
(1, 11)
Not found
Not found
Not found
Found
(1, 10)
Found
(1, 11)
Found
(1, 11)
Not found
Found
(1, 10)
Found
(1, 11)
Found
(1, 11)
Found
(1, 10)
Found
(1, 11)
Found
(1, 11)
Not found
Found
(1, 10)
Found
(1, 11)
Found
(1, 11)
Found
(1, 10)
Found
(1, 11)
Found
(1, 10)
Found
(1, 10)
Found
(1, 11)
Found
(1, 11)
Found
(1, 10)
Found
(1, 10)
Found
(1, 10)
Found
(1, 10)
Found
(1, 10)
Found
(1, 11)
Found
(1, 11)
Found
(1, 10)
Found
(1, 10)
Found
(1, 11)
Found
(1, 11)
Found
(1, 10)
Found
(1, 10)
Found
(1, 10)
Not found
Found
(1, 11)
Found
(1, 11)
Found
(1, 10)
Found
(1, 10)
Found
(1, 10)
Found
(1, 11)


22632.07032736393

In [34]:
from sklearn.tree import DecisionTreeRegressor
regressor = DecisionTreeRegressor(random_state=0)
train_dummed = pd.get_dummies(X_train)
regressor.fit(train_dummed,y_train)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=0, splitter='best')

In [35]:
y_1 = regressor.predict(fix_columns(pd.get_dummies(X_test), pd.get_dummies(X_train).columns))
mean_squared_error(y_1, y_test)

1113.3333333333333