In [1]:
# Stephen Cheney
# Homework 10

import numpy as np
from numpy import linspace, array, random, mean, std, round, floor, concatenate, diag
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import glob
from imageio import imread
from skimage.feature import greycomatrix, greycoprops

def readandcalctexturefeats(path,channel=1):
    """Compute four GLCM texture features for every image file matching ``path``.

    For each file a grey-level co-occurrence matrix is built at distance 1 for
    the angles 0, pi/4, pi/2 and 3*pi/4; the contrast, correlation, energy and
    homogeneity properties are then each averaged over those four angles.

    Parameters
    ----------
    path : str
        Glob pattern selecting the image files.
    channel : int, optional
        1-based channel number (Matlab convention). Defaults to 1. Ignored
        for images that load as 2-D (single-channel) arrays.

    Returns
    -------
    listing : list of str
        The matching file names, in sorted order.
    feats : numpy.ndarray, shape (len(listing), 4)
        Row i holds [contrast, correlation, energy, homogeneity] for listing[i].
    """
    # channel numbers provided to the function start with 1 (for compatibility with Matlab)
    chan = channel - 1

    # glob returns matches in arbitrary order; sorting keeps the row order
    # (and anything seeded that depends on it) reproducible across machines
    listing = sorted(glob.glob(path))

    props = ('contrast', 'correlation', 'energy', 'homogeneity')
    feats = np.zeros([len(listing), len(props)])
    for i, filename in enumerate(listing):
        img = imread(filename)
        # greyscale files load as 2-D arrays and have no channel axis to index
        plane = img if img.ndim == 2 else img[:, :, chan]
        # NOTE(review): newer scikit-image releases appear to rename these to
        # graycomatrix/graycoprops - confirm against the installed version
        GLCMS = greycomatrix(plane, [1], [0, np.pi/4, np.pi/2, 3*np.pi/4], normed=True)
        for k, prop in enumerate(props):
            # row 0 is the single distance requested; average over the angles
            feats[i, k] = mean(greycoprops(GLCMS, prop=prop)[0, :])
    return listing, feats


def makeconfmat(trueclass,predclass):
    """Build a confusion matrix from 1-based integer class labels.

    Parameters
    ----------
    trueclass : sequence of numbers
        True class label of each sample (labels start at 1).
    predclass : sequence of numbers
        Predicted class label of each sample, same length as ``trueclass``.

    Returns
    -------
    numpy.ndarray of int, shape (k, k)
        Entry [t-1, p-1] counts the samples whose true label is t and whose
        predicted label is p; k is the largest label seen in either input
        (0 for empty input).

    Raises
    ------
    ValueError
        If the inputs differ in length or contain a label below 1.
    """
    true_idx = np.asarray(trueclass).astype(int) - 1
    pred_idx = np.asarray(predclass).astype(int) - 1
    if true_idx.shape != pred_idx.shape:
        raise ValueError('trueclass and predclass must have the same length.')
    if true_idx.size == 0:
        return np.zeros([0, 0]).astype(int)
    # a label of 0 would become index -1 and silently land in the last row/column
    if min(true_idx.min(), pred_idx.min()) < 0:
        raise ValueError('class labels must be integers starting at 1.')

    # size from both vectors so a prediction above max(trueclass) still fits
    nclass = int(max(true_idx.max(), pred_idx.max())) + 1
    confmat = np.zeros([nclass, nclass]).astype(int)
    # unbuffered add so repeated (true, pred) pairs are each counted
    np.add.at(confmat, (true_idx, pred_idx), 1)

    return confmat




def dividefolds(X,y,foldind,ncross,icross):
    """Split X and y into the training and test sets for one cross-validation fold.

    When ``icross <= 1`` a fresh stratified fold assignment is drawn: the
    points of each class are randomly permuted (numpy's global RNG) and dealt
    into ``ncross`` folds so each fold holds roughly the same fraction of
    every class. The assignment is returned so the caller can pass it back
    in for the remaining folds.

    Parameters
    ----------
    X : numpy.ndarray
        Feature matrix with one row per point.
    y : array-like
        Integer-valued class label of each point; same length as X.
    foldind : numpy.ndarray
        Fold number (1..ncross) of each point. Ignored and regenerated when
        ``icross <= 1``.
    ncross : int
        Number of folds.
    icross : int
        1-based number of the fold to hold out as the test set.

    Returns
    -------
    Xtrain, ytrain, Xtest, ytest, foldind

    Raises
    ------
    ValueError
        If X and y do not have the same number of points.
    """
    y = np.asarray(y)
    npoints = X.shape[0]
    if npoints != len(y):
        raise ValueError('dimensions of X and y must match.')

    if icross <= 1:
        # -1 marks "not yet assigned"; every point gets a fold number below
        tind = np.zeros([npoints]) - 1
        # make sure each fold has approximately equal fraction of each class
        for i in range(int(min(y)), int(max(y)+1)):
            thisclass = np.extract(y == i, range(0, npoints))
            nthis = len(thisclass)
            if nthis > 0:
                # get a random ordering of the data points of this class
                rndtmp = random.permutation(thisclass)
                istart = 0
                for j in range(1, ncross+1):
                    # fold j takes the permuted points up to the j/ncross mark
                    istop = int(floor(nthis*j/ncross))
                    tind[rndtmp[istart:istop]] = j
                    istart = istop
        foldind = tind

    test_rows = np.extract(foldind == icross, range(0, npoints))
    Xtest = X[test_rows]
    ytest = y[test_rows]

    train_rows = np.extract(foldind != icross, range(0, npoints))
    Xtrain = X[train_rows]
    ytrain = y[train_rows]

    return Xtrain, ytrain, Xtest, ytest, foldind



def trainandtest(x,y,ncross):
    """Run one ncross-fold cross-validation pass with a linear discriminant classifier.

    Each fold is held out in turn, an LDA model (lsqr solver, no shrinkage)
    is fitted on the remaining points, and the held-out predictions from all
    folds are pooled into a single confusion matrix.

    Returns
    -------
    confmat : confusion matrix of the pooled test predictions
    accuracy : diagonal sum of confmat divided by its total
    """
    # unpacking doubles as a check that x is two-dimensional
    n_rows, n_cols = x.shape

    # placeholder only; dividefolds builds the real assignment on fold 1
    fold_of_point = [None]

    for fold in range(1, ncross+1):
        Xtrain, ytrain, Xtest, ytest, fold_of_point = dividefolds(x, y, fold_of_point, ncross, fold)

        model = LinearDiscriminantAnalysis(solver='lsqr', shrinkage=None)
        model.fit(Xtrain, ytrain)
        fold_pred = model.predict(Xtest)

        # pool held-out labels and predictions across folds
        if icross_is_later_fold := (fold > 1):
            truth = concatenate((truth, ytest))
            predictions = concatenate((predictions, fold_pred))
        else:
            truth = ytest
            predictions = fold_pred

    confmat = makeconfmat(truth, predictions)

    n_correct = sum(diag(confmat))
    n_total = sum(sum(confmat))

    return confmat, n_correct/n_total


def crossvalclassifier(x,y,ncross,ntrials,seed=666):
    """Repeat ncross-fold cross-validation ntrials times and summarise the results.

    Parameters
    ----------
    x : numpy.ndarray
        Feature matrix, one row per sample.
    y : array-like
        Class label of each sample (list, tuple or numpy array).
    ncross : int
        Number of cross-validation folds per trial.
    ntrials : int
        Number of independent cross-validation trials.
    seed : int, optional
        Value passed to numpy's global ``random.seed`` before the first trial
        so the fold assignments are repeatable. Defaults to 666.

    Returns
    -------
    cmat : numpy.ndarray
        Confusion matrix averaged over the trials, rounded to 2 decimals.
    meanacc : float
        Mean of the per-trial accuracies.
    stdacc : float
        Standard deviation of the per-trial accuracies.
    """
    # accept any sequence of labels; an existing array is used as-is
    ycopy = np.asarray(y)

    # initialize the random number generator to a known value
    random.seed(seed)

    # initialize the combined confusion matrix (for all trials) to zeros
    nclass = int(max(ycopy))
    combconf = np.zeros([nclass, nclass])
    acc = np.zeros([ntrials])

    for i in range(0, ntrials):
        confmat, accuracy = trainandtest(x, ycopy, ncross)
        # add the values for the new confusion matrix to the previous ones
        combconf = combconf + confmat
        # remember the accuracy for this trial
        acc[i] = accuracy

    cmat = round(np.divide(combconf, ntrials), 2)
    meanacc = mean(acc)
    stdacc = std(acc)

    return cmat, meanacc, stdacc


In [2]:
# Mount Google Drive in the Colab runtime at /content/drive; the image folder
# used by the later cells lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
%%!
# NOTE(review): this cd runs in a shell subprocess, so it presumably does not
# change the notebook kernel's working directory - confirm. The recorded
# output shows the directory was not found when this cell was run.
cd /content/drive/MyDrive/CMPBIO250/HW10

['/bin/bash: line 1: cd: /content/drive/MyDrive/CMPBIO250/HW10: No such file or directory']

In [4]:
# calculate features from the image files for the four protein classes
from numpy import ones, concatenate, transpose
import numpy as np
import os
folder = "/content/drive/MyDrive/CMPBIO250/HW10" # set the folder path with the images
os.chdir(folder)

# (file-name prefix, name shown in the printout) for each protein class;
# the class label is the 1-based position in this list
# (1=NCL, 2=TFRC, 3=TMTC, 4=TRMT)
protein_classes = [('NCL', 'NCL'), ('TFRC', 'TFRC'), ('TMTC', 'TMTC2'), ('TRMT', 'TRMT')]

# fetch the images of each class and print its feature matrix
class_results = []
for prefix, display_name in protein_classes:
    class_results.append(readandcalctexturefeats(folder+'/'+prefix+'*.jpg',1))
    print("Feature matrix for " + display_name + " protein")
    print(class_results[-1][1])

# keep the per-class variable names available to the rest of the notebook
(filenames1,feats1),(filenames2,feats2),(filenames3,feats3),(filenames4,feats4) = class_results

# combine the feature matrices for the four classes
x=concatenate((feats1, feats2, feats3, feats4))

# fail here with a clear message rather than deep inside the cross-validation
if x.shape[0] == 0:
    raise FileNotFoundError('No NCL/TFRC/TMTC/TRMT *.jpg images found in ' + folder)

# make a vector containing the class labels, one entry per feature row
y=concatenate([np.repeat(label, feats.shape[0])
               for label, feats in enumerate((feats1, feats2, feats3, feats4), start=1)])

ntrials = 10; # set trials to 10
nfolds = 6 # six-fold
[cmat,meanacc,stdacc] = crossvalclassifier(x,y,nfolds,ntrials);
print('Confusion matrix')
print(cmat)
print('Average accuracy = ' + str(meanacc))

FileNotFoundError: ignored

##### Question 2 #####
a) 0.2 probability

b) Classes 3 and 4 are the most confused with each other: the probability is 0.23 that an image whose true class is 3 (TMTC) is predicted as class 4 (TRMT), and 0.2 that an image whose true class is 4 is predicted as class 3.

c)

Confusion matrix

[5.3 0.6 0.  ]

[0.  5.1 0.7 ]

[0.  0.7 3.95]

Average accuracy = 0.7972

d) The average probability would be around 0.5, or 50%.

In [None]:
##### Question 3 #####

import numpy as np
from numpy import linspace, array, random, mean, std, round, floor, concatenate, diag
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier
import glob
from imageio import imread
from skimage.feature import greycomatrix, greycoprops

def readandcalctexturefeats(path,channel=1):
    """Compute four GLCM texture features for every image file matching ``path``.

    For each file a grey-level co-occurrence matrix is built at distance 1 for
    the angles 0, pi/4, pi/2 and 3*pi/4; the contrast, correlation, energy and
    homogeneity properties are then each averaged over those four angles.

    Parameters
    ----------
    path : str
        Glob pattern selecting the image files.
    channel : int, optional
        1-based channel number (Matlab convention). Defaults to 1. Ignored
        for images that load as 2-D (single-channel) arrays.

    Returns
    -------
    listing : list of str
        The matching file names, in sorted order.
    feats : numpy.ndarray, shape (len(listing), 4)
        Row i holds [contrast, correlation, energy, homogeneity] for listing[i].
    """
    # channel numbers provided to the function start with 1 (for compatibility with Matlab)
    chan = channel - 1

    # glob returns matches in arbitrary order; sorting keeps the row order
    # (and anything seeded that depends on it) reproducible across machines
    listing = sorted(glob.glob(path))

    props = ('contrast', 'correlation', 'energy', 'homogeneity')
    feats = np.zeros([len(listing), len(props)])
    for i, filename in enumerate(listing):
        img = imread(filename)
        # greyscale files load as 2-D arrays and have no channel axis to index
        plane = img if img.ndim == 2 else img[:, :, chan]
        # NOTE(review): newer scikit-image releases appear to rename these to
        # graycomatrix/graycoprops - confirm against the installed version
        GLCMS = greycomatrix(plane, [1], [0, np.pi/4, np.pi/2, 3*np.pi/4], normed=True)
        for k, prop in enumerate(props):
            # row 0 is the single distance requested; average over the angles
            feats[i, k] = mean(greycoprops(GLCMS, prop=prop)[0, :])
    return listing, feats


def makeconfmat(trueclass,predclass):
    """Build a confusion matrix from 1-based integer class labels.

    Parameters
    ----------
    trueclass : sequence of numbers
        True class label of each sample (labels start at 1).
    predclass : sequence of numbers
        Predicted class label of each sample, same length as ``trueclass``.

    Returns
    -------
    numpy.ndarray of int, shape (k, k)
        Entry [t-1, p-1] counts the samples whose true label is t and whose
        predicted label is p; k is the largest label seen in either input
        (0 for empty input).

    Raises
    ------
    ValueError
        If the inputs differ in length or contain a label below 1.
    """
    true_idx = np.asarray(trueclass).astype(int) - 1
    pred_idx = np.asarray(predclass).astype(int) - 1
    if true_idx.shape != pred_idx.shape:
        raise ValueError('trueclass and predclass must have the same length.')
    if true_idx.size == 0:
        return np.zeros([0, 0]).astype(int)
    # a label of 0 would become index -1 and silently land in the last row/column
    if min(true_idx.min(), pred_idx.min()) < 0:
        raise ValueError('class labels must be integers starting at 1.')

    # size from both vectors so a prediction above max(trueclass) still fits
    nclass = int(max(true_idx.max(), pred_idx.max())) + 1
    confmat = np.zeros([nclass, nclass]).astype(int)
    # unbuffered add so repeated (true, pred) pairs are each counted
    np.add.at(confmat, (true_idx, pred_idx), 1)

    return confmat




def dividefolds(X,y,foldind,ncross,icross):
    """Split X and y into the training and test sets for one cross-validation fold.

    When ``icross <= 1`` a fresh stratified fold assignment is drawn: the
    points of each class are randomly permuted (numpy's global RNG) and dealt
    into ``ncross`` folds so each fold holds roughly the same fraction of
    every class. The assignment is returned so the caller can pass it back
    in for the remaining folds.

    Parameters
    ----------
    X : numpy.ndarray
        Feature matrix with one row per point.
    y : array-like
        Integer-valued class label of each point; same length as X.
    foldind : numpy.ndarray
        Fold number (1..ncross) of each point. Ignored and regenerated when
        ``icross <= 1``.
    ncross : int
        Number of folds.
    icross : int
        1-based number of the fold to hold out as the test set.

    Returns
    -------
    Xtrain, ytrain, Xtest, ytest, foldind

    Raises
    ------
    ValueError
        If X and y do not have the same number of points.
    """
    y = np.asarray(y)
    npoints = X.shape[0]
    if npoints != len(y):
        raise ValueError('dimensions of X and y must match.')

    if icross <= 1:
        # -1 marks "not yet assigned"; every point gets a fold number below
        tind = np.zeros([npoints]) - 1
        # make sure each fold has approximately equal fraction of each class
        for i in range(int(min(y)), int(max(y)+1)):
            thisclass = np.extract(y == i, range(0, npoints))
            nthis = len(thisclass)
            if nthis > 0:
                # get a random ordering of the data points of this class
                rndtmp = random.permutation(thisclass)
                istart = 0
                for j in range(1, ncross+1):
                    # fold j takes the permuted points up to the j/ncross mark
                    istop = int(floor(nthis*j/ncross))
                    tind[rndtmp[istart:istop]] = j
                    istart = istop
        foldind = tind

    test_rows = np.extract(foldind == icross, range(0, npoints))
    Xtest = X[test_rows]
    ytest = y[test_rows]

    train_rows = np.extract(foldind != icross, range(0, npoints))
    Xtrain = X[train_rows]
    ytrain = y[train_rows]

    return Xtrain, ytrain, Xtest, ytest, foldind



def trainandtest(x,y,ncross):
    """Run one ncross-fold cross-validation pass with a decision tree classifier.

    Each fold is held out in turn, a DecisionTreeClassifier (random splitter,
    log2 max_features) is fitted on the remaining points, and the held-out
    predictions from all folds are pooled into a single confusion matrix.

    Returns
    -------
    confmat : confusion matrix of the pooled test predictions
    accuracy : diagonal sum of confmat divided by its total
    """
    # unpacking doubles as a check that x is two-dimensional
    n_rows, n_cols = x.shape

    # placeholder only; dividefolds builds the real assignment on fold 1
    fold_of_point = [None]

    for fold in range(1, ncross+1):
        Xtrain, ytrain, Xtest, ytest, fold_of_point = dividefolds(x, y, fold_of_point, ncross, fold)

        # Changed trainer to DecisionTreeClassifier
        model = DecisionTreeClassifier(splitter='random', max_features="log2")
        model.fit(Xtrain, ytrain)
        fold_pred = model.predict(Xtest)

        # pool held-out labels and predictions across folds
        if fold > 1:
            truth = concatenate((truth, ytest))
            predictions = concatenate((predictions, fold_pred))
        else:
            truth = ytest
            predictions = fold_pred

    confmat = makeconfmat(truth, predictions)

    n_correct = sum(diag(confmat))
    n_total = sum(sum(confmat))

    return confmat, n_correct/n_total


def crossvalclassifier(x,y,ncross,ntrials,seed=666):
    """Repeat ncross-fold cross-validation ntrials times and summarise the results.

    Parameters
    ----------
    x : numpy.ndarray
        Feature matrix, one row per sample.
    y : array-like
        Class label of each sample (list, tuple or numpy array).
    ncross : int
        Number of cross-validation folds per trial.
    ntrials : int
        Number of independent cross-validation trials.
    seed : int, optional
        Value passed to numpy's global ``random.seed`` before the first trial
        so the fold assignments are repeatable. Defaults to 666.

    Returns
    -------
    cmat : numpy.ndarray
        Confusion matrix averaged over the trials, rounded to 2 decimals.
    meanacc : float
        Mean of the per-trial accuracies.
    stdacc : float
        Standard deviation of the per-trial accuracies.
    """
    # accept any sequence of labels; an existing array is used as-is
    ycopy = np.asarray(y)

    # initialize the random number generator to a known value
    random.seed(seed)

    # initialize the combined confusion matrix (for all trials) to zeros
    nclass = int(max(ycopy))
    combconf = np.zeros([nclass, nclass])
    acc = np.zeros([ntrials])

    for i in range(0, ntrials):
        confmat, accuracy = trainandtest(x, ycopy, ncross)
        # add the values for the new confusion matrix to the previous ones
        combconf = combconf + confmat
        # remember the accuracy for this trial
        acc[i] = accuracy

    cmat = round(np.divide(combconf, ntrials), 2)
    meanacc = mean(acc)
    stdacc = std(acc)

    return cmat, meanacc, stdacc

In [None]:
# calculate features from the image files for the four protein classes
from numpy import ones, concatenate, transpose
import numpy as np
import os
folder = "/content/drive/MyDrive/CMPBIO250/HW10" # set the folder path with the images
os.chdir(folder)

# file-name prefix of each protein class; the class label is the 1-based
# position in this list (1=NCL, 2=TFRC, 3=TMTC, 4=TRMT)
protein_prefixes = ['NCL', 'TFRC', 'TMTC', 'TRMT']

# fetch the images of each class and compute their texture features
class_results = [readandcalctexturefeats(folder+'/'+prefix+'*.jpg',1)
                 for prefix in protein_prefixes]

# keep the per-class variable names available to the rest of the notebook
(filenames1,feats1),(filenames2,feats2),(filenames3,feats3),(filenames4,feats4) = class_results

# combine the feature matrices for the four classes
x=concatenate((feats1, feats2, feats3, feats4))

# fail here with a clear message rather than deep inside the cross-validation
if x.shape[0] == 0:
    raise FileNotFoundError('No NCL/TFRC/TMTC/TRMT *.jpg images found in ' + folder)

# make a vector containing the class labels, one entry per feature row
y=concatenate([np.repeat(label, feats.shape[0])
               for label, feats in enumerate((feats1, feats2, feats3, feats4), start=1)])

ntrials = 10; # set trials to 10
nfolds = 6 # six-fold
[cmat,meanacc,stdacc] = crossvalclassifier(x,y,nfolds,ntrials);
print('\n\nConfusion matrix')
print(cmat)
print('Average accuracy = ' + str(meanacc))

3a) I chose the DecisionTreeClassifier and kept the options at their defaults except for the splitter and max_features. I changed the splitter to "random" to allow for a more branching tree that accounts for edge-case differences, and set max_features to "log2" since it gave me a greater probability than setting it to an int of n_features.

3c) The accuracy is slightly worse than the other classifier. I believe this is because the decision tree is a linear method that cannot work in reverse to distinguish features. Therefore, it has less ability to distinguish between features and images overall.