In [3]:
import pandas as pd
import os
import numpy as np
from sklearn import metrics
from scipy.stats import zscore
from sklearn.model_selection import KFold
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.callbacks import EarlyStopping
path = "./data/"


filename_read=os.path.join(path,"auto-mpg.csv")
filename_write=os.path.join(path,"auto-mpg-put-of-sample.csv")
df=pd.read_csv(filename_read,na_values=['NA','?'])

def missing_median(df, name):
    med = df[name].median()
    df[name] = df[name].fillna(med)

def to_xy(df, target):
    result = []
    for x in df.columns:
        if x != target:
            result.append(x)
    # find out the type of the target column.  Is it really this hard? :(
    target_type = df[target].dtypes
    target_type = target_type[0] if hasattr(target_type, '__iter__') else target_type
    # Encode to int for classification, float otherwise. TensorFlow likes 32 bits.
    if target_type in (np.int64, np.int32):
        # Classification
        dummies = pd.get_dummies(df[target])
        return df.as_matrix(result).astype(np.float32), dummies.as_matrix().astype(np.float32)
    else:
        # Regression
        return df.as_matrix(result).astype(np.float32), df.as_matrix([target]).astype(np.float32)
    
    
#Shuffle
np.random.seed(42)
df=df.reindex(np.random.permutation(df.index))
df.reset_index(inplace=True,drop=True)

#Preprocess
cars=df['name']
df.drop('name',1,inplace=True)
missing_median(df,'horsepower')

#Encode to 2D matrix for training
x,y=to_xy(df,'mpg')

#Cross-Validate
kf=KFold(5)

oos_y=[]
oos_pred=[]
fold=0

for train,test in kf.split(x):
    fold+=1
    print("Fold #{}".format(fold))
    
    x_train=x[train]
    y_train=y[train]
    x_test=x[test]
    y_test=y[test]
    
    model=Sequential()
    model.add(Dense(20, input_dim=x.shape[1],activation='relu'))
    model.add(Dense(10,activation='relu'))
    model.add(Dense(1))
    model.compile(loss='mean_squared_error',optimizer='adam')
    
    monitor=EarlyStopping(monitor='val_loss',min_delta=1e-3,patience=5,verbose=1,mode='auto')
    model.fit(x_train,y_train,validation_data=(x_test,y_test),callbacks=[monitor],verbose=0,epochs=1000)
    
    pred=model.predict(x_test)
    oos_y.append(y_test)
    oos_pred.append(pred)
    
    #Measure this fold's RMSE
    score=np.sqrt(metrics.mean_squared_error(pred,y_test))
    print("Fold score (RMSE): {}".format(score))
    
#Build oos prediction list and calculate the error
oos_y=np.concatenate(oos_y)
oos_pred=np.concatenate(oos_pred)
score=np.sqrt(metrics.mean_squared_error(oos_pred,oos_y))
print("final, out of sample score (RMSE):{}".format(score))

#Write cross validated prediction
oos_y=pd.DataFrame(oos_y)
oos_pred=pd.DataFrame(oos_pred)
oosDF=pd.concat([df,oos_y,oos_pred],axis=1)
oosDF.to_csv(filename_write,index=False)

Fold #1
Epoch 00223: early stopping
Fold score (RMSE): 9.043562889099121
Fold #2
Epoch 00084: early stopping
Fold score (RMSE): 3.758021593093872
Fold #3
Epoch 00300: early stopping
Fold score (RMSE): 3.571667432785034
Fold #4
Epoch 00010: early stopping
Fold score (RMSE): 13.354731559753418
Fold #5
Epoch 00418: early stopping
Fold score (RMSE): 4.244455814361572
final, out of sample score (RMSE):7.79866361618042


In [4]:
x,y

(array([[   4.        ,   91.        ,   53.        , ...,   17.39999962,
           76.        ,    3.        ],
        [   4.        ,  120.        ,   79.        , ...,   18.60000038,
           82.        ,    1.        ],
        [   6.        ,  232.        ,  100.        , ...,   13.        ,
           71.        ,    1.        ],
        ..., 
        [   4.        ,  134.        ,   95.        , ...,   14.80000019,
           78.        ,    3.        ],
        [   4.        ,   89.        ,   62.        , ...,   17.29999924,
           81.        ,    3.        ],
        [   4.        ,   97.        ,   46.        , ...,   21.        ,
           73.        ,    2.        ]], dtype=float32), array([[ 33.        ],
        [ 28.        ],
        [ 19.        ],
        [ 13.        ],
        [ 14.        ],
        [ 27.        ],
        [ 24.        ],
        [ 13.        ],
        [ 17.        ],
        [ 21.        ],
        [ 15.        ],
        [ 38.        ]

In [22]:
import pandas as pd
import os
import numpy as np
from sklearn import metrics, preprocessing
from scipy.stats import zscore
from sklearn.model_selection import KFold
from keras.models import Sequential
from keras.layers.core import Dense, Activation

path = "./data/"

filename_read = os.path.join(path,"iris.csv")
filename_write = os.path.join(path,"iris-out-of-sample.csv")
df = pd.read_csv(filename_read,na_values=['NA','?'])
def encode_text_index(df, name):
    le = preprocessing.LabelEncoder()
    df[name] = le.fit_transform(df[name])
    return le.classes_

# Shuffle
np.random.seed(42)
df = df.reindex(np.random.permutation(df.index))
df.reset_index(inplace=True, drop=True)

#Encode to a 2Dmatrix for trainig
species=encode_text_index(df,"species")
x,y=to_xy(df,"species")    

kf=KFold(5)

oos_y = []
oos_pred = []
fold = 0

for train, test in kf.split(x):
    fold+=1
    print("Fold #{}".format(fold))
        
    x_train = x[train]
    y_train = y[train]
    x_test = x[test]
    y_test = y[test]
    
    model=Sequential()
    model.add(Dense(50,input_dim=x.shape[1],activation='relu'))#Hidden1
    model.add(Dense(25,activation='relu'))#Hidden 2
    model.add(Dense(y.shape[1],activation='softmax'))#Output
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    monitor=EarlyStopping(monitor='val_loss',min_delta=1e-3,patience=25,verbose=1,mode='auto')
    
    model.fit(x,y,validation_data=(x_test,y_test),callbacks=[monitor],verbose=0,epochs=1000)
    
    pred=model.predict(x_test)
    #print(pred)
    oos_y.append(y_test)
    pred=np.argmax(pred,axis=1) #raw probabilities to chosen class
    oos_pred.append(pred)
    
    #measure this fold's accuracy
    y_compare=np.argmax(y_test,axis=1)
    score=metrics.accuracy_score(y_compare,pred)
    print('Fold score (accuracy): {}'.format(score))
    
# Build the oos prediction list and calculate the error.
oos_y = np.concatenate(oos_y)
oos_pred = np.concatenate(oos_pred)
oos_y_compare = np.argmax(oos_y,axis=1) # For accuracy calculation

score = metrics.accuracy_score(oos_y_compare, oos_pred)
print("Final score (accuracy): {}".format(score))    
    
# Write the cross-validated prediction
oos_y = pd.DataFrame(oos_y_compare)
oos_pred = pd.DataFrame(oos_pred)
oosDF = pd.concat( [df, oos_y, oos_pred],axis=1 )
oosDF.to_csv(filename_write,index=False)    

Fold #1
Epoch 00332: early stopping
Fold score (accuracy): 1.0
Fold #2
Epoch 00201: early stopping
Fold score (accuracy): 0.9666666666666667
Fold #3
Epoch 00411: early stopping
Fold score (accuracy): 0.9666666666666667
Fold #4
Epoch 00164: early stopping
Fold score (accuracy): 0.9333333333333333
Fold #5
Epoch 00277: early stopping
Fold score (accuracy): 1.0
Final score (accuracy): 0.9733333333333334


In [26]:
from sklearn.model_selection import train_test_split
path = "./data/"

filename_read = os.path.join(path,"auto-mpg.csv")
filename_write = os.path.join(path,"auto-mpg-holdout.csv")
df = pd.read_csv(filename_read,na_values=['NA','?'])

def encode_text_dummy(df, name):
    dummies = pd.get_dummies(df[name])
    for x in dummies.columns:
        dummy_name = "{}-{}".format(name, x)
        df[dummy_name] = dummies[x]
    df.drop(name, axis=1, inplace=True)

#create a feature vector
missing_median(df,'horsepower')
df.drop('name',1,inplace=True)
encode_text_dummy(df,'origin')

# Shuffle
np.random.seed(42)
df = df.reindex(np.random.permutation(df.index))
df.reset_index(inplace=True, drop=True)

# Encode to a 2D matrix for training
x,y = to_xy(df,'mpg')

#keep 10% holdout set
x_main, x_holdout, y_main, y_holdout=train_test_split(x,y,test_size=0.10)
kf=KFold(5)
oos_y=[]
oos_pred=[]
fold=0
for train, test in kf.split(x_main):
    fold+=1
    print('Fold #{}'.format(fold))
    
    x_train = x_main[train]
    y_train = y_main[train]
    x_test = x_main[test]
    y_test = y_main[test]
    
    model=Sequential()
    model.add(Dense(20, input_dim=x.shape[1],activation='relu'))
    model.add(Dense(5,activation='relu'))
    model.add(Dense(1))  
    model.compile(loss='mean_squared_error',optimizer='adam')
    
    monitor=EarlyStopping(monitor='val_loss',min_delta=1e-3,patience=25,verbose=1,mode='auto')
    model.fit(x_train,y_train,validation_data=(x_test,y_test),callbacks=[monitor],verbose=0,epochs=1000)
        
    pred = model.predict(x_test)
    
    oos_y.append(y_test)
    oos_pred.append(pred) 

    # Measure accuracy
    score = np.sqrt(metrics.mean_squared_error(pred,y_test))
    print("Fold score (RMSE): {}".format(score))

# Build the oos prediction list and calculate the error.
oos_y = np.concatenate(oos_y)
oos_pred = np.concatenate(oos_pred)
score = np.sqrt(metrics.mean_squared_error(oos_pred,oos_y))
print()
print("Cross-validated score (RMSE): {}".format(score)) 

holdout_pred=model.predict(x_holdout)
score=np.sqrt(metrics.mean_squared_error(holdout_pred,y_holdout))
print('Holdout RMSE : {}'.format(score))

Fold #1
Epoch 00079: early stopping
Fold score (RMSE): 15.866439819335938
Fold #2
Epoch 00478: early stopping
Fold score (RMSE): 3.6949167251586914
Fold #3
Epoch 00674: early stopping
Fold score (RMSE): 3.791503429412842
Fold #4
Epoch 00281: early stopping
Fold score (RMSE): 4.095532417297363
Fold #5
Fold score (RMSE): 16.953285217285156

Cross-validated score (RMSE): 10.797898292541504
Holdout RMSE : 18.55446434020996
