In [1]:
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit

class dataloader():
    """Load a CSV file into a DataFrame and expose feature/target arrays.

    Attributes:
        data_name: Dataset identifier; selects the dataset-specific cleaning step.
        data: The loaded (and cleaned) pandas DataFrame.
        X, y: Arrays filled in by ``get_input_target`` (``None`` until then).
        X_train, X_test, y_train, y_test: Arrays filled in by
            ``test_train_split`` (``None`` until then).
    """

    def __init__(self, filename, data_name, header='infer'):
        """Read ``filename`` with pandas and run ``clean`` on the result.

        Args:
            filename: Path of the CSV file, passed to ``pd.read_csv``.
            data_name: Dataset identifier (e.g. ``'kaggle_cat'``).
            header: Forwarded to ``pd.read_csv``; pass ``None`` for files
                without a header row.
        """
        self.data_name = data_name
        self.data = pd.read_csv(filename, header=header)
        self.clean()
        self.X = self.y = None
        self.X_train = self.X_test = self.y_train = self.y_test = None

    def clean(self):
        """Apply the cleaning step that belongs to ``data_name`` (if any)."""
        if self.data_name == 'kaggle_cat':
            # Remove the 'id' column from the Kaggle table.
            self.data = self.data.drop(columns='id')

    def get_input_target(self, supervised=True):
        """Populate ``self.X`` (and ``self.y``) from ``self.data``.

        Args:
            supervised: When true, the last column becomes the target ``y``
                (kept as a 1-D array) and every other column becomes ``X``,
                cast to ``str``. When false, the whole table becomes ``X``
                (no cast) and ``y`` is reset to ``None``.
        """
        dataset = self.data.values
        if supervised:
            self.X = dataset[:, :-1].astype(str)
            # y stays 1-D: the former `self.y.reshape(...)` call discarded its
            # result, so it never changed the shape.
            self.y = dataset[:, -1]
        else:
            self.X = dataset
            # Do not keep a target left over from an earlier supervised call.
            self.y = None

    def test_train_split(self, X, y, test_size=0.33, random_state=1):
        """Create one stratified train/test split and store it on the instance.

        Args:
            X: Indexable feature array.
            y: Indexable target array used for stratification.
            test_size: Forwarded to ``StratifiedShuffleSplit``.
            random_state: Forwarded to ``StratifiedShuffleSplit``.
        """
        sss = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=random_state)
        # n_splits=1, so the generator yields exactly one (train, test) index pair.
        train_index, test_index = next(sss.split(X, y))
        self.X_train, self.X_test = X[train_index], X[test_index]
        self.y_train, self.y_test = y[train_index], y[test_index]

In [79]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import numpy as np
# For the difference between OrdinalEncoder and LabelEncoder, see:
# https://datascience.stackexchange.com/questions/39317/difference-between-ordinalencoder-and-labelencoder

class encoding:
    """Common base for the categorical encoders defined in this notebook.

    Planned contract (from the original design notes):
      * encode inputs: takes nothing, yields X and y
      * encode target: label encoding by default
      * optionally hold the data itself (currently not stored)
    """

    def __init__(self):
        # Subclasses replace this with their concrete encoder object(s).
        self.encoder = None
    
class label(encoding):
    """Column-wise label encoding with an explicit ``'Unk'`` bucket.

    One ``LabelEncoder`` is fitted per input column. Each encoder is fitted
    with the extra class ``'Unk'`` so that values first seen outside training
    can be mapped to it instead of raising.
    """

    def __init__(self):
        super().__init__()
        self.encoder = []  # fitted encoders, one per column, in column order
        self.X_train = self.X_test = None

    def encode_inputs(self, X, mode):
        """Encode every column of ``X`` to integer codes.

        Args:
            X: 2-D array of categorical values (rows = samples).
            mode: ``"train"`` fits fresh per-column encoders and stores them in
                ``self.encoder``; any other value reuses the stored encoders
                and maps unseen values to ``'Unk'``.

        Returns:
            2-D integer array with the same shape as ``X``.

        Raises:
            ValueError: If ``mode`` is not ``"train"`` and the number of stored
                encoders does not match the number of columns in ``X``.
        """
        columns = X.T
        if mode == "train":
            # Start from an empty list: appending to encoders left over from an
            # earlier "train" call would make later lookups hit the stale ones.
            self.encoder = []
            for col in columns:
                encoder_col = LabelEncoder()
                encoder_col.fit(list(col) + ['Unk'])  # reserve a class for unknown labels
                self.encoder.append(encoder_col)
        elif len(self.encoder) != len(columns):
            raise ValueError(
                "expected %d columns (one per fitted encoder), got %d"
                % (len(self.encoder), len(columns))
            )

        X_enc = []
        for encoder_col, col in zip(self.encoder, columns):
            # A single pass with set membership replaces every unseen value.
            known = set(encoder_col.classes_)
            mapped = [x if x in known else 'Unk' for x in col]
            X_enc.append(encoder_col.transform(mapped))
        return np.vstack(X_enc).T
    
class ordinal(encoding):
    """Ordinal encoding of all input columns with a single ``OrdinalEncoder``."""

    def __init__(self):
        super().__init__()
        self.encoder = OrdinalEncoder()
        self.X_train = self.X_test = None

    def encode_inputs(self, X, mode="train"):
        """Encode ``X`` with the wrapped ``OrdinalEncoder``.

        Args:
            X: 2-D array of categorical values.
            mode: ``"train"`` (the default, matching the previous behaviour)
                fits the encoder on ``X`` before transforming; any other value
                only transforms with the already-fitted encoder, so held-out
                data is not used for fitting. Mirrors the ``mode`` argument of
                the sibling encoders.

        Returns:
            The encoded array produced by the encoder.
        """
        if mode == "train":
            return self.encoder.fit_transform(X)
        return self.encoder.transform(X)
    
class OHE(encoding):
    """One-hot encoding; categories unseen during fitting are ignored."""

    def __init__(self, sparse=True):
        """Create the wrapped ``OneHotEncoder``.

        Args:
            sparse: Whether the encoder should return a sparse matrix.
        """
        super().__init__()
        # scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed
        # the old name; try the new keyword first and fall back for older
        # releases, where the unknown keyword raises TypeError.
        try:
            self.encoder = OneHotEncoder(sparse_output=sparse, handle_unknown="ignore")
        except TypeError:
            self.encoder = OneHotEncoder(sparse=sparse, handle_unknown="ignore")
        self.X_train = self.X_test = None

    def encode_inputs(self, X, mode):
        """One-hot encode ``X``.

        Args:
            X: 2-D array of categorical values.
            mode: ``"train"`` fits the encoder on ``X`` and transforms it; any
                other value only transforms with the fitted encoder.

        Returns:
            The encoded matrix produced by the encoder.
        """
        if mode == "train":
            return self.encoder.fit_transform(X)
        return self.encoder.transform(X)
    
# Open question: should the data live in the parent class? Do we need it there at all?

# class target(encoding)

# class EE(encoding)

# class autoencoder(encoding)

### Kaggle Dataset

In [73]:
# Load the Kaggle categorical dataset twice and split each copy into X / y.
kc_train = dataloader('data/kaggle_cat_train.csv', "kaggle_cat")
kc_train.get_input_target()
# NOTE(review): this reads the same 'kaggle_cat_train.csv' file as kc_train, so
# the "test" set is identical to the training set — confirm whether a separate
# test file was intended.
kc_test = dataloader('data/kaggle_cat_train.csv', "kaggle_cat")
kc_test.get_input_target()

In [74]:
# Label-encode the Kaggle features: fit the per-column encoders on the train
# loader's X, then reuse them (mode="test") on the test loader's X.
kag_label = label()
kag_label.X_train = kag_label.encode_inputs(kc_train.X, mode="train")
kag_label.X_test = kag_label.encode_inputs(kc_test.X, mode="test")

In [75]:
# One-hot encode the Kaggle features: fit on the train loader's X, then
# transform the test loader's X with the fitted encoder.
kag_ohe = OHE(sparse=True)
kag_ohe.X_train = kag_ohe.encode_inputs(kc_train.X, mode="train")
kag_ohe.X_test = kag_ohe.encode_inputs(kc_test.X, mode="test")

### Breast Cancer Dataset

In [76]:
# Load the breast-cancer CSV (read with header=None, i.e. no header row),
# split it into X / y, and create one stratified train/test split using the
# loader's defaults (test_size=0.33, random_state=1).
bc = dataloader('data/breast_cancer.csv', "breast_cancer", header=None)
bc.get_input_target()
bc.test_train_split(bc.X, bc.y)

In [77]:
# Label-encode the breast-cancer features: fit on the training split, then
# reuse the fitted per-column encoders on the test split.
bc_label = label()
bc_label.X_train = bc_label.encode_inputs(bc.X_train, mode="train")
bc_label.X_test = bc_label.encode_inputs(bc.X_test, mode="test")

In [78]:
# One-hot encode the breast-cancer features with the breast-cancer encoder.
# The earlier version called kag_ohe on the Kaggle arrays here (copy-paste
# slip), which refitted the Kaggle encoder and stored Kaggle data on bc_ohe.
bc_ohe = OHE(sparse=True)
bc_ohe.X_train = bc_ohe.encode_inputs(bc.X_train, mode="train")
bc_ohe.X_test = bc_ohe.encode_inputs(bc.X_test, mode="test")

### Insights Dataset

In [80]:
# Load the insights CSV. supervised=False stores the whole table as X (no
# target column is split off, and the values are not cast to str).
insights = dataloader('data/Insights/insights.csv', "insights")
insights.get_input_target(supervised=False)

In [81]:
# Label-encode the insights table. X is cast to str here because the
# unsupervised loader path leaves the values uncast.
# The encoded matrix is stored on the loader object (insights.X_enc).
ins_label = label()
insights.X_enc = ins_label.encode_inputs(insights.X.astype(str), mode="train")

In [82]:
# One-hot encode the insights table (cast to str first).
# NOTE(review): the result is stored on the encoder (ins_ohe.X_enc), whereas the
# label-encoding cell stores its result on the loader (insights.X_enc) —
# confirm which location is intended.
ins_ohe = OHE(sparse=True)
ins_ohe.X_enc = ins_ohe.encode_inputs(insights.X.astype(str), mode="train")

In [None]:
class model:
    """Placeholder for a model wrapper; its structure has not been designed yet."""
    # TODO: think of how to structure this

In [None]:
### Experiments

In [1]:
# load and summarize the dataset


# load the dataset
def load_dataset(filename):
	"""Load a header-less CSV and split it into string features and a 2-D target.

	Args:
		filename: Path of the CSV file (read with ``header=None``).

	Returns:
		Tuple ``(X, y)``: ``X`` holds every column but the last, cast to ``str``;
		``y`` holds the last column reshaped to ``(n_rows, 1)``.
	"""
	# load the dataset as a pandas DataFrame
	# (pd.read_csv: the bare name `read_csv` is never imported in this notebook)
	data = pd.read_csv(filename, header=None)
	# retrieve numpy array
	dataset = data.values
	# split into input (X) and output (y) variables
	X = dataset[:, :-1]
	y = dataset[:, -1]
	# format all fields as string
	X = X.astype(str)
	# reshape target to be a 2d array
	y = y.reshape((len(y), 1))
	return X, y

# load the dataset
# train_test_split is not imported anywhere else in this notebook, so import
# it here to keep the cell runnable on a fresh kernel.
from sklearn.model_selection import train_test_split

# load the dataset
X, y = load_dataset('data/breast_cancer.csv')
# split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
# summarize
print('Train', X_train.shape, y_train.shape)
print('Test', X_test.shape, y_test.shape)

Train (191, 9) (191, 1)
Test (95, 9) (95, 1)


In [2]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import LabelEncoder
def prepare_inputs(X_train, X_test):
	"""Ordinal-encode both feature matrices with an encoder fitted on the training split.

	Args:
		X_train: Training features; used to fit the encoder.
		X_test: Test features; only transformed.

	Returns:
		Tuple ``(X_train_enc, X_test_enc)`` of encoded arrays.
	"""
	feature_encoder = OrdinalEncoder()
	feature_encoder.fit(X_train)
	encoded_train, encoded_test = (feature_encoder.transform(part) for part in (X_train, X_test))
	return encoded_train, encoded_test

In [3]:

# prepare target
def prepare_targets(y_train, y_test):
	"""Label-encode both target arrays with an encoder fitted on the training targets.

	Args:
		y_train: Training targets; used to fit the encoder.
		y_test: Test targets; only transformed.

	Returns:
		Tuple ``(y_train_enc, y_test_enc)`` of encoded targets.
	"""
	target_encoder = LabelEncoder()
	target_encoder.fit(y_train)
	encoded = [target_encoder.transform(part) for part in (y_train, y_test)]
	return encoded[0], encoded[1]

In [4]:
# prepare target
def prepare_targets_ordinal(y_train, y_test):
	"""Encode both target arrays with an ``OrdinalEncoder`` fitted on the training targets.

	Args:
		y_train: Training targets; used to fit the encoder.
		y_test: Test targets; only transformed.

	Returns:
		Tuple ``(y_train_enc, y_test_enc)`` of encoded targets.
	"""
	target_encoder = OrdinalEncoder()
	target_encoder.fit(y_train)
	train_codes = target_encoder.transform(y_train)
	test_codes = target_encoder.transform(y_test)
	return train_codes, test_codes

In [5]:
# prepare input data
# Ordinal-encode the features (encoder fitted on the training split only).
X_train_enc, X_test_enc = prepare_inputs(X_train, X_test)
# Encode the targets. The LabelEncoder variant is kept below for reference;
# the OrdinalEncoder variant is the one actually used.
#y_train_enc, y_test_enc = prepare_targets(y_train, y_test)
y_train_enc, y_test_enc = prepare_targets_ordinal(y_train, y_test)

In [6]:
### Credits: https://machinelearningmastery.com/how-to-prepare-categorical-data-for-deep-learning-in-python/

In [7]:
# define the  model
# Build, train, and evaluate a small feed-forward binary classifier on the
# ordinal-encoded features.
# NOTE(review): the name `model` is also used by the placeholder `class model`
# in an earlier cell; this assignment rebinds it.
# NOTE(review): no random seed is set in this cell, so the reported accuracy
# may differ between runs — confirm whether seeding is wanted.
from keras.models import Sequential
from keras.layers import Dense
model = Sequential()
# hidden layer: 10 ReLU units, one input per encoded feature column
model.add(Dense(10, input_dim=X_train_enc.shape[1], activation='relu', kernel_initializer='he_normal'))
# output layer: a single sigmoid unit
model.add(Dense(1, activation='sigmoid'))
# compile the keras model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit the keras model on the dataset
model.fit(X_train_enc, y_train_enc, epochs=100, batch_size=16, verbose=0)
# evaluate the keras model on the held-out split; the first value returned by
# evaluate is discarded and the second is reported as accuracy
_, accuracy = model.evaluate(X_test_enc, y_test_enc, verbose=0)
print('Accuracy: %.2f' % (accuracy*100))

Using TensorFlow backend.


Accuracy: 71.58


In [31]:
# Call fit again on the same `model` object for another 100 epochs, then
# re-evaluate on the test split.
# NOTE(review): presumably this continues from the already-trained weights
# rather than starting over — confirm. This cell's execution count is out of
# sequence with the cells above, so check that it still runs on a fresh kernel.
model.fit(X_train_enc, y_train_enc, epochs=100, batch_size=16, verbose=0)
# evaluate the keras model
_, accuracy = model.evaluate(X_test_enc, y_test_enc, verbose=0)
print('Accuracy: %.2f' % (accuracy*100))

Accuracy: 70.53
