In [19]:
%matplotlib inline


# Classifier comparison


A comparison of a several classifiers in scikit-learn on synthetic datasets.
The point of this example is to illustrate the nature of decision boundaries
of different classifiers.
This should be taken with a grain of salt, as the intuition conveyed by
these examples does not necessarily carry over to real datasets.

Particularly in high-dimensional spaces, data can more easily be separated
linearly and the simplicity of classifiers such as naive Bayes and linear SVMs
might lead to better generalization than is achieved by other classifiers.

The plots show training points in solid colors and testing points
semi-transparent. The lower right shows the classification accuracy on the test
set.


In [20]:
print(__doc__)


# Code source: Gaël Varoquaux
#              Andreas Müller
# Modified for documentation by Jaques Grobler
# License: BSD 3 clause

Automatically created module for IPython interactive environment


In [21]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
import pandas as pd 
from glob import glob

h = .02  # step size in the mesh

In [22]:
names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", "Gaussian Process",
         "Decision Tree", "Random Forest", "Neural Net", "AdaBoost",
         "Naive Bayes", "QDA"]

In [23]:
classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    GaussianProcessClassifier(1.0 * RBF(1.0)),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    MLPClassifier(alpha=1, max_iter=1000),
    AdaBoostClassifier(),
    GaussianNB(),
    QuadraticDiscriminantAnalysis()]

In [24]:
path= "../../Dataset"
def folder_finder(path):

    file_list = []

    for j in glob(path+"*.csv"):
        file_list.append(j)

    return file_list
    print(file_list)
    print('Total files:',len(file_list))

In [25]:
# read csv file
def read_file(path):

    data = pd.read_csv(path)
    return data
    data.head()

In [26]:
# pandas to numpy 
def pd_to_np(data):
    
    if type(data) == np.ndarray:
      print('Data is already in numpy format!')
    else:
      data = data.values
      #print('Pandas to Numpy done!')

    return data

In [27]:
# string to id
def string_to_index(activity_label):

    har_class={} 
    activity_class =[]

    activity_label = pd_to_np(activity_label)
    unique, counts = np.unique(activity_label, return_counts=True)

    # string to index dict
    for i in range(len(unique)):
        har_class[unique[i]]=i

    # activity class tranform into indexes
    for i in range(len(activity_label)):
        activity_class.append(har_class[activity_label[i][0]])

    return activity_class

In [28]:
def dataset_info(file_list):

    for file in file_list:

        X_train, X_test, y_train, y_test = dataset(file)
        print("File: ",file)
        print("Size: ",y_train.shape[0])
        print("X_train: ", X_train.shape)
        print("y_train: ", y_train.shape)

        print("X_test: ", X_test.shape)
        print("y_test: ", y_test.shape)

In [29]:
def data_loader(path, split=0.3):
    
    
    X_train =  X_test =  y_train =  y_test = []
    x = y = []
    pd_data = read_file(path)


    feature_list = []

    for i in pd_data:
        feature_list.append(i)

  
    selectData = pd_data.loc[:, feature_list[:-1]]
    activityLabel = pd_data.loc[:, ['activity']]


    x = pd_to_np(selectData)
    x = StandardScaler().fit_transform(x)
    y = string_to_index(activityLabel)

    # y = activityClass
    y = np.asarray(y) 
    y = y.astype('int32')

    #print(y)

    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = split,random_state=42)

    # Feature Scaling
    # sc = StandardScaler()
    # X_train = sc.fit_transform(X_train)
    # X_test = sc.transform(X_test)

    return X_train, X_test, y_train, y_test

In [30]:
def get_args():

    parser = argparse.ArgumentParser()

    # parser.add_argument('-gpu-id', type=int, default=0)

    # dataset
    parser.add_argument('-data_dir',type=str,default='../Dataset/')
    parser.add_argument('-pretrain',type=bool,default=False)

    # parser.add_argument('-epoch', type=int, default=100)
    # parser.add_argument('-lr', type=float, default=0.001)
    # parser.add_argument('-use-cuda', default=True, action='store_true')

    return parser.parse_args()

In [31]:
    file_list = folder_finder("../../Dataset")
    print(file_list)
    print('Total files:',len(file_list))


    #model_arch = model_init()
    #save_model_name = "pretrain/"+datetime.now().strftime('time_%H_%M_%S__date_%Y-%m-%d')+".pkl"

    # if args.pretrain:
    #     print('Loading pretrain model...')
    #     model = pickle.load(open(save_model_name, 'rb'))

    x_test_bulk = []
    y_test_bulk = []

    for i in range(len(file_list)):

        X_train, X_test, y_train, y_test = data_loader(file_list[i])

        model = train_model(model_arch, X_train, y_train)
        pickle.dump(model, open(save_model_name, 'wb'))

        x_test_bulk.append(X_test)
        y_test_bulk.append(y_test)
        # pred_tree = test_model(model, X_test)
        # model_evalution(y_test,pred_tree)
        #dataset_info(file_list)

    for i in range(len(y_test_bulk)):
        model = pickle.load(open(save_model_name, 'rb'))
        pred_tree = test_model(model, x_test_bulk[i])
        model_evalution(y_test_bulk[i],pred_tree)

[]
Total files: 0


In [32]:
#file_path= "../../Dataset/dataset_1.csv"
#dataframe = pd.read_csv(file_path)
#X, y = 
#rng = np.random.RandomState(2)
#X += 2 * rng.uniform(size=X.shape)
#linearly_separable = (X, y)

In [33]:

###datasets.head()###
##datasets = [make_moons(noise=0.3, random_state=0),
            ##make_circles(noise=0.2, factor=0.5, random_state=1),
            ##linearly_separable]
#data(noise=0.3, random_state=0)#,
##list to dataframe
#dataframe = pd.DataFrame(datasets) 
#dataframe.head()
#dataframe to list
#datasets = dataframe.values.tolist()

In [34]:
figure = plt.figure(figsize=(27, 9))
i = 1

    # just plot the dataset first
    cm = plt.cm.RdBu
    cm_bright = ListedColormap(['#FF0000', '#0000FF'])
    ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
    if ds_cnt == 0:
        ax.set_title("Input data")
    # Plot the training points
    ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright,
               edgecolors='k')
    # Plot the testing points
    ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, alpha=0.6,
               edgecolors='k')
    ax.set_xlim(xx.min(), xx.max())
    ax.set_ylim(yy.min(), yy.max())
    ax.set_xticks(())
    ax.set_yticks(())
    i += 1

    # iterate over classifiers
    """"for name, clf in zip(names, classifiers):
        ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)

        # Plot the decision boundary. For that, we will assign a color to each
        # point in the mesh [x_min, x_max]x[y_min, y_max].
        if hasattr(clf, "decision_function"):
            Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
        else:
            Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]

        # Put the result into a color plot
        Z = Z.reshape(xx.shape)
        ax.contourf(xx, yy, Z, cmap=cm, alpha=.8)

        # Plot the training points
        ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright,
                   edgecolors='k')
        # Plot the testing points
        ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright,
                   edgecolors='k', alpha=0.6)

        ax.set_xlim(xx.min(), xx.max())
        ax.set_ylim(yy.min(), yy.max())
        ax.set_xticks(())
        ax.set_yticks(())
        if ds_cnt == 0:
            ax.set_title(name)
        ax.text(xx.max() - .3, yy.min() + .3, ('%.2f' % score).lstrip('0'),
                size=15, horizontalalignment='right')
        i += 1"""

plt.tight_layout()
plt.show()

IndentationError: unexpected indent (<ipython-input-34-8f147807b100>, line 5)