In [261]:
# This is my file to run first ML on my actual dataset

## Prepare the data 
Read in the file

File format: 
x | y | z | label 
This should be prepared beforehand as this code only runs like that. No ID given or considered. 

(Note: I cannot read in the whole data set for whatever reason, it cannot see it even it is in the same directory)

In [262]:
import pandas as pd
import os
import numpy as np
import sklearn 
import matplotlib.pyplot as plt
import scipy.linalg as scplinag
from sklearn.neighbors import KDTree

In [263]:
# Define a data frame with all my data
FILE_PATH = r"../DATA"
FILE_NAME_SUBSET = r"/dataset_ML_CL_subset_1.csv"
FILE_NAME_FULL = r"/Cassette_GT.txt"
df = pd.read_csv(FILE_PATH+FILE_NAME_SUBSET, delimiter=',')
# df = pd.read_fwf("../DATA/Cassette_GT.txt")
df.head()

Unnamed: 0,//X,Y,Z,class
0,1903.382935,21122.49023,38.99118,303040192
1,1903.404541,21122.48633,38.990284,303040192
2,1903.428223,21122.48047,38.990547,303040192
3,1903.451538,21122.47656,38.991211,303040192
4,1903.474854,21122.4707,38.991467,303040192


In [264]:
# Rename the column 
df = df.rename(index=str, columns={"//X": "X"})

In [265]:
# Shows me the column headers
df.columns

Index([u'X', u'Y', u'Z', u'class'], dtype='object')

In [266]:
rows, cols = df.shape
print "#rows = instances:", rows

#rows = instances: 1048575


In [267]:
# Test whether there are any NAN values in the data set 
if df.isnull().values.any() == False:
    print "No Nan in this data set"

No Nan in this data set


## Now convert the dataframe into a numpy array dataset so we can work with it 
However, keep the df object for statistical analysis later 

In [268]:
# data is now a numpy ndarray with 1048575 rows and 4 cols 
data = df.values
data.shape

(1048575, 4)

In [269]:
# Define a subset to work with for now
# Only look at first 1000 points 
subset = data[:10000]

In [270]:
def getColumns(data, get_label = True):
    """
    INPUT: 
    data: dataset as np.array with 3 (xyz) or 4 (xyz label) attributes
    get_label: Boolean, whether or not to extract the label as well 
    OUTPUT: 
    xyz (optional: label) as numpy.array with dim: size of dataset x 1
    """
    # Get xyz values as column vectors 
    x = subset[:,0]
    y = subset[:,1]
    z = subset[:,2]
    
    if get_label == True:
        x = subset[:,0]
        y = subset[:,1]
        z = subset[:,2]
        label = subset[:,3]
        return x,y,z,label
    else: 
        return x,y,z

In [271]:
# These are the column vectors
x,y,z,label = getColumns(subset, True)
# This is the dataset in xyz only 
dataxyz = subset[:,0:3]

## Define all functions needed for the parameter retrieval 

In [272]:
def calcCovarianceMatrix(data):
    """
    Function to compute the covariance matrix.
    
    Input: Dataset of 3D points; i.e. array of dimension: #points x 3 
    Output: 3x3 covariance matrix (np.array)
    """
    # Create covariance matrix and array to store the mean values for x_mean, y_mean, z_mean
    C = np.zeros((data.shape[1], data.shape[1]))
    mean_xyz = []
    # Calculate all mean values
    for i in range(0, dataxyz.shape[1]):
        mean_xyz.append(dataxyz[:,i].mean())
    mean_xyz = np.array(mean_xyz)
    # Check whether dimensions agree 
    if dataxyz[:,0].size != dataxyz[:,1].size or dataxyz[:,0].size != dataxyz[:,2].size:
        print "X, Y and Z must be of same dimensions."
    else:
        # For each row in covariance matrix C
        for i in range(0, C.shape[0]):
            # For each column in covariance matrix C
            for j in range(0, C.shape[1]):
                C[i,j] = 0
                # For each point in the dataset, access x, y, z-values
                for point in dataxyz:
                    # For each point, access x,y and z in all combinations (xx, xy, xz, yx, yy, yz etc)
                    C[i][j] = C[i][j] + (point[i]-mean_xyz[i])*(point[j]-mean_xyz[j])
    # Divide by the total number of points                
    C = (1.0/dataxyz.shape[0]) * C
    return C 

In [273]:
# Get eight parameters for each point

def calcFeatureDescr(covarianceMatrix):
    """
    Function to compute the 8 feature descriptors for each point.
    
    Input: 3x3 Covariance matrix of a point and its neighbourhood 
    
    Output: np Array with feature descriptors as described by Weinmann et al. (1D array with 8 elements)
    
    """
    D, V = scplinag.eigh(C)
    # We sort the array with eigenvalues by size (from smallest to largest value)
    D.sort()
    # Get eigenvectors
    e1 = V[2] # eigenvector in direction of largest variance
    e2 = V[1] # second eigenvector, perpend. to e1
    e3 = V[0]
    # Find the eigenvalues
    evalue1 = D[2] # largest
    evalue2 = D[1]
    evalue3 = D[0] # smallest

    # Linearity
    lambda1 = (evalue1 - evalue2) / evalue1
    # Planarity
    lambda2 = (evalue2 - evalue3) / evalue1
    # Scattering
    lambda3 = evalue3 / evalue1
    # Omnivariance
    lambda4 = pow(evalue1*evalue2*evalue3, 1/3.0)
    # Anisotropy
    lambda5 = (evalue1 - evalue3) / evalue1
    # Eigentropy
    s = 0
    for elem in D:
        s = s + (elem*np.log(elem))
    lambda6 = (-1)*s  
    # Sum of eigenvalues
    lambda7 = sum(D)
    # Change of curvature
    lambda8 = evalue3/sum(D) 
    
    featureDescriptor = np.array([lambda1, lambda2, lambda3, lambda4, lambda5, lambda6, lambda7, lambda8])
    return featureDescriptor

## Compute features

In [275]:
# For all points now 
# Create kd-tree
kdt = KDTree(dataxyz, leaf_size=30, metric='euclidean')
# Get list with indices, the first value is always the point itself
idx_list = kdt.query(dataxyz, k=100, return_distance=False)
store = []
for j in range(0, dataxyz.shape[0]):
    # Look at all points now
    neighbourhood = []
    for i in idx_list[j]:
        neighbourhood.append(dataxyz[i])
    neighbourhood = np.array(neighbourhood)
    # Everything we did before with dataset, we do now with the neighbourhood only
    C = calcCovarianceMatrix(neighbourhood)
    feat = calcFeatureDescr(C)
    row_with_additional_col = np.append(dataxyz[j], feat)
    store.append(row_with_additional_col)
store = np.array(store)

In [276]:
store.shape

(10000, 11)

## Analyse and export features 
Convert the 8 features I just created into dataframe in pandas to analyse them better 

In [277]:
# Create a data frame with the calculated features 
df2 = pd.DataFrame({
    'X': store[:,0],
    'Y': store[:,1],
    'Z': store[:,2],
    'lambda1': store[:,3],
    'lambda2': store[:,4],
    'lambda3': store[:,5],
    'lambda4': store[:,6],
    'lambda5': store[:,7],
    'lambda6': store[:,8],
    'lambda7': store[:,9],
    'lambda8': store[:,10]
})
df2.head()

Unnamed: 0,X,Y,Z,lambda1,lambda2,lambda3,lambda4,lambda5,lambda6,lambda7,lambda8
0,1903.382935,21122.49023,38.99118,0.604788,0.395171,4.1e-05,2.054374,0.999959,-467.976464,113.227327,2.9e-05
1,1903.404541,21122.48633,38.990284,0.604788,0.395171,4.1e-05,2.054374,0.999959,-467.976464,113.227327,2.9e-05
2,1903.428223,21122.48047,38.990547,0.604788,0.395171,4.1e-05,2.054374,0.999959,-467.976464,113.227327,2.9e-05
3,1903.451538,21122.47656,38.991211,0.604788,0.395171,4.1e-05,2.054374,0.999959,-467.976464,113.227327,2.9e-05
4,1903.474854,21122.4707,38.991467,0.604788,0.395171,4.1e-05,2.054374,0.999959,-467.976464,113.227327,2.9e-05


In [278]:
df2.to_csv(FILE_PATH+'/dataset_ML_CL_subset_1_prep.csv')

In [279]:
# Also create a df where I add a label to the table 
df2['class'] = label.astype(int)

In [280]:
df2.head()

Unnamed: 0,X,Y,Z,lambda1,lambda2,lambda3,lambda4,lambda5,lambda6,lambda7,lambda8,class
0,1903.382935,21122.49023,38.99118,0.604788,0.395171,4.1e-05,2.054374,0.999959,-467.976464,113.227327,2.9e-05,303040192
1,1903.404541,21122.48633,38.990284,0.604788,0.395171,4.1e-05,2.054374,0.999959,-467.976464,113.227327,2.9e-05,303040192
2,1903.428223,21122.48047,38.990547,0.604788,0.395171,4.1e-05,2.054374,0.999959,-467.976464,113.227327,2.9e-05,303040192
3,1903.451538,21122.47656,38.991211,0.604788,0.395171,4.1e-05,2.054374,0.999959,-467.976464,113.227327,2.9e-05,303040192
4,1903.474854,21122.4707,38.991467,0.604788,0.395171,4.1e-05,2.054374,0.999959,-467.976464,113.227327,2.9e-05,303040192


In [281]:
df2.to_csv(FILE_PATH+'/dataset_ML_CL_subset_1_prep_class.csv')

In [307]:
# Create a data frame with the calculated features 
df3 = pd.DataFrame({
    'lambda1': store[:,3],
    'lambda2': store[:,4],
    'lambda3': store[:,5],
    'lambda4': store[:,6],
    'lambda5': store[:,7],
    'lambda6': store[:,8],
    'lambda7': store[:,9],
    'lambda8': store[:,10]
})
df3['class'] = label.astype(int)
df3.head()
df3.to_csv(FILE_PATH+'/dataset_ML_CL_subset_1_prep_class_noxyz.csv')

In [303]:
# Tells me how many differnt classes are present in the subset I chose 
df2['class'].value_counts()

203000000    4744
202020000    4025
303040192    1110
202030000      84
202040000      37
Name: class, dtype: int64

## Summary so far 
Steps: 
1) Read data in
2) prepare data: subset, columns 
3) convert to numpy
4) Get features
**Note: here only for 100 points so far (from a subset anyway, this is not the whole data file)**

For the ML part now, work with: 
**Dataframes:** df2 either with class or without 
**Numpy array data:** store

## ML Part 
From here on, try ML 

In [283]:
# Just rename it for more convenience
# This is the data with xyz + 8 features + class (last entry)
data_ML = df2.values

In [284]:
data_ML.shape

(10000, 12)

In [285]:
data_ML[0]

array([ 1.90338294e+03,  2.11224902e+04,  3.89911804e+01,  6.04788148e-01,
        3.95170802e-01,  4.10500520e-05,  2.05437416e+00,  9.99958950e-01,
       -4.67976464e+02,  1.13227327e+02,  2.94212268e-05,  3.03040192e+08])

In [286]:
# Function to get the number of x % of a dataset 
def getXpercentOfDataset(dataset, percent):
    m = (dataset.shape[0]*percent)/100.0
    n = (dataset.shape[0]*(100-percent))/100.0
    return int(m), int(n)

In [287]:
# Stratified samppling does not work for whatever reason 
# I just use random sampling and shuffle the data 

m, n = getXpercentOfDataset(data_ML, 20)
print m, n

X, y = data_ML[:,3:11], data_ML[:,11:12].astype(int)

X_train, X_test, y_train, y_test = X[m:], X[:m], y[m:], y[:m]

shuffle_index = np.random.permutation(n)
X_train, y_train = X_train[shuffle_index], y_train[shuffle_index]

2000 8000


In [288]:
X_train.shape

(8000, 8)

In [289]:
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [290]:
print "Predicted value:", clf.predict([X_train[50]])
print "True label:", y_train[50]
if clf.predict([X_train[50]]) == y_train[50]:
    print "Successful"
else: 
    print "Classification failed"

Predicted value: [203000000]
True label: [203000000]
Successful


## Evaluation of my calssification result 

In [291]:
# Get the accuracy in % for a classification result
# My function to just count the number of times I got it right 
def getAccuracy(predictions, labels):
    """
    INPUT: predicted values of a classifier (test set); true labels of that test data set
    OUTPUT: accuracy in %
    """
    count = 0
    for i in range(0, labels.shape[0]):
        if predictions[i] == labels[i]:
            # each time the prediction matches with the actual label, the counter goes up 1 
            count = count + 1
    count = float(count)
    acc = (count/labels.shape[0])*100
    
    return acc

In [292]:
p_train = clf.predict(X_train)
p_test = clf.predict(X_test)

In [293]:
# This is the accuracy for my training data set 
print "This is the accuracy for my training dataset:", getAccuracy(p_train, y_train), "%"
print "This is the accuracy for my testing dataset:", getAccuracy(p_test, y_test), "%"

This is the accuracy for my training dataset: 49.0125 %
This is the accuracy for my testing dataset: 41.15 %
