In [26]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.decomposition import PCA # Principal Component Analysis
from sklearn.preprocessing import StandardScaler # Feature Scaling

In [7]:
"""Unsupervised techniques are often used in the analysis of genomic data.
In particular, PCA and hierarchical clustering are popular tools. We illustrate
these techniques on the NCI60 cancer cell line microarray data, which
consists of 6,830 gene expression measurements on 64 cancer cell lines."""
# Load the expression matrix (x) and the cancer-type labels (y); in both
# files the first CSV column is used as the row index.
x, y = (
    pd.read_csv(csv_name, index_col=0)
    for csv_name in ('NCI60_X.csv', 'NCI60_Y.csv')
)

In [11]:
"""Each cell line is labeled with a cancer type. We do not make use of the
cancer types in performing PCA and clustering, as these are unsupervised
techniques. But after performing PCA and clustering, we will check to
see the extent to which these cancer types agree with the results of these
unsupervised techniques."""

# Show the (rows, columns) shape of the feature matrix, then of the labels.
print(*(frame.shape for frame in (x, y)))

(64, 6830) (64, 1)


In [12]:
# Preview the first five rows of the feature matrix.
x.head(5)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,6821,6822,6823,6824,6825,6826,6827,6828,6829,6830
V1,0.3,1.18,0.55,1.14,-0.265,-0.07,0.35,-0.315,-0.45,-0.654981,...,-0.990019,0.0,0.03,-0.175,0.629981,-0.03,0.0,0.28,-0.34,-1.93
V2,0.679961,1.289961,0.169961,0.379961,0.464961,0.579961,0.699961,0.724961,-0.040039,-0.28502,...,-0.270058,-0.300039,-0.250039,-0.535039,0.109941,-0.860039,-1.250049,-0.770039,-0.390039,-2.000039
V3,0.94,-0.04,-0.17,-0.04,-0.605,0.0,0.09,0.645,0.43,0.475019,...,0.319981,0.12,-0.74,-0.595,-0.27002,-0.15,0.0,-0.12,-0.41,0.0
V4,0.28,-0.31,0.68,-0.81,0.625,-1.387779e-17,0.17,0.245,0.02,0.095019,...,-1.24002,-0.11,-0.16,0.095,-0.35002,-0.3,-1.15001,1.09,-0.26,-1.1
V5,0.485,-0.465,0.395,0.905,0.2,-0.005,0.085,0.11,0.235,1.490019,...,0.55498,-0.775,-0.515,-0.32,0.63498,0.605,0.0,0.745,0.425,0.145


In [23]:
# Preview the first five rows of the label frame.
y.head(5)

Unnamed: 0,x
1,CNS
2,CNS
3,CNS
4,RENAL
5,BREAST


In [24]:
# Count how many cell lines carry each cancer-type label (column 'x').
y.loc[:, 'x'].value_counts()

RENAL          9
NSCLC          9
MELANOMA       8
COLON          7
BREAST         7
OVARIAN        6
LEUKEMIA       6
CNS            5
PROSTATE       2
K562B-repro    1
MCF7A-repro    1
UNKNOWN        1
MCF7D-repro    1
K562A-repro    1
Name: x, dtype: int64

### PCA on the NCI60 Data

6830

In [55]:
# Before scaling
# Let's have a look at average and variance of features
%matplotlib notebook
fig, axe = plt.subplots(2,1, figsize=(5,4))
axe[0].plot(range(x.shape[1]), x.mean())
axe[0].set_title('Mean')

axe[1].plot(range(x.shape[1]), x.var())
axe[1].set_title('Variance')

plt.tight_layout()

<IPython.core.display.Javascript object>

In [67]:
# Scaling
# Feature Scaling
sc = StandardScaler()
x_sc = sc.fit_transform(x.values)

df_sc = pd.DataFrame(data = x_sc, columns = x.columns)

# Mean and variance after scaling
%matplotlib notebook
fig, axe = plt.subplots(2,1, figsize=(5,4))
axe[0].plot(range(x.shape[1]), df_sc.mean())
axe[0].set_title('Mean')

axe[1].plot(range(x.shape[1]), df_sc.var())
axe[1].set_title('Variance')

plt.tight_layout()

<IPython.core.display.Javascript object>

In [68]:
#We now plot the first few principal component score vectors, in order to visualize the data.