In [2]:
import tensorflow.contrib.learn as skflow
import os
from sklearn import metrics
from scipy.stats import zscore
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

path = "./data/"

filename_read = os.path.join(path,"auto-mpg.csv")
df = pd.read_csv(filename_read,na_values=['NA','?'])

In [3]:
def remove_outliers(df, name, sd):
    drop_rows = df.index[(np.abs(df[name] - df[name].mean()) >= (sd * df[name].std()))]
    df.drop(drop_rows, axis=0, inplace=True)

def missing_median(df, name):
    med = df[name].median()
    df[name] = df[name].fillna(med)
missing_median(df,'horsepower')
#df.drop('name',1,inplace=True)



print('Length before MPG outliers are dropped {}'.format(len(df)))
remove_outliers(df,'horsepower',2)
print('Length after {}'.format(len(df)))
print(df)

Length before MPG outliers are dropped 398
Length after 381
      mpg  cylinders  displacement  horsepower  weight  acceleration  year  \
0    18.0          8         307.0       130.0    3504          12.0    70   
1    15.0          8         350.0       165.0    3693          11.5    70   
2    18.0          8         318.0       150.0    3436          11.0    70   
3    16.0          8         304.0       150.0    3433          12.0    70   
4    17.0          8         302.0       140.0    3449          10.5    70   
10   15.0          8         383.0       170.0    3563          10.0    70   
11   14.0          8         340.0       160.0    3609           8.0    70   
12   15.0          8         400.0       150.0    3761           9.5    70   
14   24.0          4         113.0        95.0    2372          15.0    70   
15   22.0          6         198.0        95.0    2833          15.5    70   
16   18.0          6         199.0        97.0    2774          15.5    70   
17  

In [None]:
#Dealing with addresses onwards...

In [7]:
import pandas as pd
import os
import numpy as np
from sklearn import metrics
from scipy.stats import zscore
from sklearn.model_selection import KFold
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.callbacks import EarlyStopping

path="./data/"

filename_read=os.path.join(path,"auto-mpg.csv")
filename_write=os.path.join(path,"auto-mpg-out-of-sample.csv")
df=pd.read_csv(filename_read,na_values=['NA','?'])

#Shuffle
np.random.seed(42)
df=df.reindex(np.random.permutation(df.index))
df.reset_index(inplace=True, drop=True)

#Preprocess
cars=df['name']
df.drop('name',1,inplace=True)
missing_median(df,'horsepower')
#Encode to 2d Matrix for trainig

def to_xy(df, target):
    result = []
    for x in df.columns:
        if x != target:
            result.append(x)
    # find out the type of the target column.  Is it really this hard? :(
    target_type = df[target].dtypes
    target_type = target_type[0] if hasattr(target_type, '__iter__') else target_type
    # Encode to int for classification, float otherwise. TensorFlow likes 32 bits.
    if target_type in (np.int64, np.int32):
        # Classification
        dummies = pd.get_dummies(df[target])
        return df.as_matrix(result).astype(np.float32), dummies.as_matrix().astype(np.float32)
    else:
        # Regression
        return df.as_matrix(result).astype(np.float32), df.as_matrix([target]).astype(np.float32)



x,y=to_xy(df,'mpg')

#Cross Validate
kf=KFold(5)

oos_y=[]
oos_pred=[]
fold = 0
for train, test in kf.split(x):
    fold+=1
    print("Fold #{}".format(fold))
        
    x_train = x[train]
    y_train = y[train]
    x_test = x[test]
    y_test = y[test]
    
    model = Sequential()
    model.add(Dense(20, input_dim=x.shape[1], activation='relu'))
    model.add(Dense(10, activation='relu'))
    model.add(Dense(1))
    model.compile(loss='mean_squared_error', optimizer='adam')
    
    monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=5, verbose=1, mode='auto')
    model.fit(x_train,y_train,validation_data=(x_test,y_test),callbacks=[monitor],verbose=0,epochs=1000)
    
    pred = model.predict(x_test)
    
    oos_y.append(y_test)
    oos_pred.append(pred)        

    # Measure this fold's RMSE
    score = np.sqrt(metrics.mean_squared_error(pred,y_test))
    print("Fold score (RMSE): {}".format(score))


# Build the oos prediction list and calculate the error.
oos_y = np.concatenate(oos_y)
oos_pred = np.concatenate(oos_pred)
score = np.sqrt(metrics.mean_squared_error(oos_pred,oos_y))
print("Final, out of sample score (RMSE): {}".format(score))    
    
# Write the cross-validated prediction
oos_y = pd.DataFrame(oos_y)
oos_pred = pd.DataFrame(oos_pred)
oosDF = pd.concat( [df, oos_y, oos_pred],axis=1 )
oosDF.to_csv(filename_write,index=False)

Fold #1
Epoch 00255: early stopping
Fold score (RMSE): 8.601637840270996
Fold #2
Epoch 00565: early stopping
Fold score (RMSE): 4.0072431564331055
Fold #3
Epoch 00448: early stopping
Fold score (RMSE): 4.947150707244873
Fold #4
Epoch 00031: early stopping
Fold score (RMSE): 15.435954093933105
Fold #5
Epoch 00406: early stopping
Fold score (RMSE): 3.91803240776062
Final, out of sample score (RMSE): 8.565091133117676


In [11]:
from sklearn import preprocessing
from sklearn import preprocessing
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import shutil
import os
import requests
import base64
# Shuffle
np.random.seed(42)
df = df.reindex(np.random.permutation(df.index))
df.reset_index(inplace=True, drop=True)

def encode_text_index(df, name):
    le = preprocessing.LabelEncoder()
    df[name] = le.fit_transform(df[name])
    return le.classes_


# Encode to a 2D matrix for training
species = encode_text_index(df,"species")
x,y = to_xy(df,"species")

# Cross-validate
kf = KFold(5)
    
oos_y = []
oos_pred = []
fold = 0

for train, test in kf.split(x):
    fold+=1
    print("Fold #{}".format(fold))
    
    x_train = x[train]
    y_train=y[train]
    x_test=x[test]  
    y_test=y[test]
    
    model=Sequential()
    model.add(Dense(50, input_dim=x.shape[1],activation='relu'))
    model.add(Dense(25,activation='relu'))
    model.add(Dense(y.shape[1],activation='softmax'))
    model.compile(loss='categorical_crossentropy',optimizer='adam')
    monitor=EarlyStopping(monitor='val_loss',min_delta=1e-3,patience=25,verbose=1,mode='auto')
    
    model.fit(x,y,validation_data=(x_test,y_test),callbacks=[monitor],verbose=0,epochs=1000)
    pred=model.predict(x_test)
    oos_y.append(y_tset)
    pred=np.argmax(pred,axis=1)
    oos_pred.append(pred)
    
    #Measure accuracy
    y_compare=np.argmax(y_test,axis=1)
    score=metrics.accuracy_score(y_compare, pred)
    print("Fold score (accuracy) : {}".format(score))
    
oos_y=np.concatenate(oos_y)
oos_pred=np.concatenate(oos_pred)
oos_y_compare=np.argmax(oos_y,axis=1)

score=metrics.accuracy_score(oos_y_compare, oos_pred)
print("Final score (accuracy): {}".format(score))

oos_y=pd.DataFrame(oos_y)
oos_pred=pd.DataFrame(oos_pred)
oosDF = pd.concat([df, oos_y,oos_pred], axis=1)
oosDF.to_csv(filename_write,index=False)



KeyError: 'species'