# Opioid Data
HW #2 Part 1 - Dimensionality Reduction.  
Use one summary vector per patient, specifically the monthly average per patient.  

## Preprocessing
Patient files are in one of two directories: R or NR.  
Each patient is represented by one CSV file.  
Each row of each CSV contains readings from one day.    
Here, we load each patient average across all days.   

In [1]:
from os import listdir
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA, KernelPCA
from sklearn.preprocessing import StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import cross_validate
import matplotlib.pyplot as plt
from matplotlib import colors
mycmap = colors.ListedColormap(['red','blue'])  # list color for label 0 then 1
np.set_printoptions(precision=2)

In [2]:
pathR='data/ChunkedData_R/'
pathN='data/ChunkedData_NR/'
filesR = listdir(pathR)
filesN = listdir(pathN)
print(len(filesR)," files of type R")
print(len(filesN)," files of type N")
CLASS_SEPARATOR=13  # data[:13] vs data[13:]

14  files of type R
26  files of type N


In [3]:
# Read one CSV file. 
# Create a Pandas data frame.
# Drop the date column.
def file_mean (filepath):
    mydata = pd.read_csv(filepath)
    # Drop the date column.
    mydata = mydata.drop('Date',axis=1) 
    # Transpose column of mean values into a row.
    mymean = mydata.mean(axis=0).to_frame().T
    myvar  = mydata.var (axis=0).to_frame().T
    return mymean,myvar

In [4]:
# Read directory of CSV files (R or NR). 
# Create one dataframe representing all files.
# Retain only one row per file = column averages.
def mean_per_file (directory,variance):
    files = listdir(directory)
    means = pd.DataFrame()
    for fp in files:
        dfp = directory+fp
        m = file_mean(dfp)
        # Let Pandas number the rows sequentially.
        means = means.append(m,ignore_index=True)
    return means

In [5]:
meansR = mean_per_file(pathR,True)
meansR.describe()

TypeError: mean_per_file() takes 1 positional argument but 2 were given

In [None]:
meansN = mean_per_file(pathN,True)
meansN.describe()

In [None]:
# Combine all data (R and NR) into one data frame.
# Label R = positive = 1 = blue.
# Label NR = negative = 0 = red.
def make_labels(positives,negatives):
    rows = positives.shape[0]
    labelsP = pd.DataFrame(np.ones(rows,dtype=np.int8))  # one = blue
    rows = negatives.shape[0]
    labelsN = pd.DataFrame(np.zeros(rows,dtype=np.int8))  # zero = red
    labelsAll = pd.concat((labelsP,labelsN),ignore_index=True)
    return labelsAll

In [None]:
meansAll = pd.concat((meansR,meansN),ignore_index=True)
meansAll.shape
labels_df = make_labels(meansR,meansN)
print(labels_df.T)
meansR = None
meansN = None

## Scaling and covariance
Normalize by subtracting the column mean from every column value.  
Since columns have widely different numerical ranges,   
also normalize by making each column have unit variance.  
Note: without normalization, the covariance plot would be all black except for the few features with large absolute values.

In [None]:
# Standardize features by shifting the mean to zero and scaling to unit variance.
def scale_features(X):
    s = StandardScaler()
    z = s.fit_transform(X)
    return z
scaledMeans = scale_features(meansAll)

In [None]:
def plot_covariance(X):
    cv=X.T.dot(X)/len(X)
    plt.imshow(cv, cmap='hot', interpolation='nearest')
    plt.show()
plot_covariance(scaledMeans)

## Principal Component Analysis (PCA)
PCA is an example of unsupervised learning.
PCA does dimensionality reduction by a linear transformation
to orthogonal axes where each successive axis 
captures most of the remaining variance.  

It is important to normalize first.
Otherwise, most variance will be explained by those columns with large absoulte values.
That was our mistake in HW #1, when 67% of variance was explained by PC1.  
We will examine prinicpal components 1, 2, 3.

In [None]:
def show_PC_variance_explained(model):
    # Assumes at least 3 principal components, or else crashes.
    e1,e2,e3=model.explained_variance_ratio_[:3]*100.0
    print("Variance explained by PC1=%.2f%% PC2=%.2f%% PC3=%.2f%%"%(e1,e2,e3))
def show_PC_eigenvalues(model):
    e1,e2,e3=model.lambdas_[:3]
    print("Eigenvalues EV1=%.2f EV2=%.2f EV3=%.2f"%(e1,e2,e3))
def construct_PCA():
    # Use the PCA class from sklearn.
    # Linear dimensionality reduction using Singular Value Decomposition.
    # This is unsupervised learning (but we'll use labels for visualization).
    # Every transform returns a COPY of the data; see copy parameter.
    # Does centering (setting mean = 0); no way to disable this.
    # Does not do scaling (setting variance = 1); no way to enable this.
    # Does NOT do whitening (setting variance = covariance = 1); see whitening parameter.
    # When n_components = None, PCA uses min(features,instances).
    # Can also be set to 'mle' or min percent of variance to explain.
    model = PCA()
    return model

In [None]:
print("PCA of total feature space:")
pca=construct_PCA()
pca.fit(scaledMeans)
show_PC_variance_explained(pca)
P_transformed=pca.fit_transform(scaledMeans)
print("Shape of transformed data",P_transformed.shape)

## Linear Discriminant Analysis (LDA)
LDA is a classification algorithm. 
LDA is an example of supervised learning.
LDA finds a linear decision boundary.

LDA can be used for dimensionality reduction.
It projects the data onto some number of axes 
that most discriminate between the classes.
The maximum number of dimensions is n_classes-1. 

In [None]:
def construct_LDA():
    # A classifier with a linear decision boundary, 
    # generated by fitting class conditional densities 
    # to the data and using Bayes’ rule.
    # Assumes a Gaussian density for each class.
    # Assumes that all classes share the same covariance matrix.
    # Solver = 'svd' (default), 'lsqr', or 'eigen'.
    # By default, priors are inferred from inputs.
    # By default, n_components = None, and LDA uses min(features,classes-1).  
    # It is possible to ask LDA for the means, classes, priors,
    # variance explained, decision boundary line, and within-class covariance.
    return LinearDiscriminantAnalysis()

In [None]:
# Must use ravel() to convert shape (40,1) to (40,).
labels_ravel=np.ravel(labels_df)
lda=construct_LDA()
training = cross_validate(lda, scaledMeans, labels_ravel, cv=5)
training['test_score']

In [None]:
print("LDA of total feature space:")
lda.fit(scaledMeans,labels_ravel)
# The linear decision boundary is a (n_features)-dimensional vector.
print("Shape of decision boundary",lda.coef_.shape)
L_transformed = lda.transform(scaledMeans)
# The transformed data is (n_classes-1)-dimensional i.e. 1D.
print("Shape of transformed data",L_transformed.shape)

## HW2.1.a: Comparison of PCA vs LDA
The LDA has more information, specifically, the labels.
Also, since we have only two classes, the LDA must perform maximum separation on one axis.
Thus, it is not surprising that the LDA achieves greater separation 
compared to the first principal component.

In [None]:
fig1 = plt.figure(figsize=(10,2)) 
ax1 = fig1.add_subplot(121)
ax1.set_title('PCA first axis')
# Extract list of PC1 coordinate for each of 40 intances.
pc1_coords = [c[0] for c in P_transformed]
ax1.hist(pc1_coords[:CLASS_SEPARATOR],histtype='step',color='blue') 
ax1.hist(pc1_coords[CLASS_SEPARATOR:],histtype='step',color='red')  
ax2 = fig1.add_subplot(122)
ax2.set_title('LDA first axis')
ax2.hist(L_transformed[:CLASS_SEPARATOR],histtype='step',color='blue') 
ax2.hist(L_transformed[CLASS_SEPARATOR:],histtype='step',color='red')  
plt.show()

## HW2.1.b: Discussion of PCA

In [None]:
def plot_PCA (transformed,labels):  
    lims=[-5,5]  #  Use this if defaults don't work:   ax1d.set_xlim(lims)
    fig = plt.figure(figsize=(10,3))
    #pc1,pc2,pc3 = np.split(transformed,3,axis=1)  # crashes if n_components != 3
    pc1 = [c[0] for c in transformed]
    pc2 = [c[1] for c in transformed]
    pc3 = [c[2] for c in transformed]
    # Plot first PC as histogram
    ax1d = fig.add_subplot(131)
    ax1d.set_title('First PC')
    ax1d.hist(pc1[:CLASS_SEPARATOR],histtype='step',color='blue')  
    ax1d.hist(pc1[CLASS_SEPARATOR:],histtype='step',color='red')  
    # Plot first 2 PCs in 2D. 
    ax2d = fig.add_subplot(132)
    ax2d.set_title('First 2 PCs')
    ax2d.scatter(pc1,pc2,c=labels,cmap=mycmap)
    # Plot first 3 PCs in 3D.
    ax3d = fig.add_subplot(133,projection='3d')
    ax3d.set_title('First 3 PCs')
    ax3d.scatter(pc1,pc2,pc3,c=labels,cmap=mycmap)
    # Output to screen.
    plt.show()
plot_PCA(P_transformed,labels_df)

In [None]:
def plot_scree(model):
    pc_values = np.arange(model.n_components_) + 1
    plt.rcParams["figure.figsize"] = (10,5)
    plt.plot(pc_values, model.explained_variance_ratio_, 'ro-', linewidth=2)
    plt.title('Scree Plot')
    plt.xlabel('Principal Component')
    plt.ylabel('Proportion of Variance Explained')
    plt.grid()
    plt.show()
plot_scree(pca)

In [None]:
print("Principal components (count,dimensions):",pca.components_.shape)

print("Additional variance explained, per principal component (first 10):")
for pc in range(0,10):
    print("%d=%.3f "%(pc+1,pca.explained_variance_ratio_[pc]),end=" ")
print()

print("Cumulative variance explained, per principal component (first 10):")
sum = 0.0
for pc in range(0,10):
    sum += pca.explained_variance_ratio_[pc]
    print("%d=%.3f "%(pc+1,sum),end=" ")
print()

def print_top_ten_features(pc,cols,cmps):
    NUM=10
    print("PC%d top ten features"%pc)
    n_features=len(cols)
    mylist=[]
    for i in range(0,n_features):
        name = cols[i]
        weight = cmps[pc-1][i]
        triple = (i,name,np.abs(weight))   
        mylist.append(triple)
    myary = np.array(mylist,dtype=[('index',int),('name','S30'),('weight',float)])
    sorted = np.sort(myary,order='weight')
    for i in range(1,NUM+1):
        val = sorted[-i]  # start at end and work to front
        print("  # %3i (%30s) %7.4f"%(val[0],val[1],val[2]))
print_top_ten_features(1,meansAll.columns,pca.components_)
print_top_ten_features(2,meansAll.columns,pca.components_)
print_top_ten_features(3,meansAll.columns,pca.components_)

In [None]:
meansAll.columns[1]