In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import sklearn

%matplotlib inline

In [None]:
# Load the raw data file. It has no header row, so the column names are supplied here:
# two leading columns, then each measurement in its "mean", "error" and "worst" variants.
_MEASUREMENTS = [
    "radius", "texture", "perimeter", "area", "smoothness",
    "compactness", "concavity", "concave points", "symmetry", "fractal dimension",
]
column_labels = (
    ["patient_id", "diagnosis"]
    + ["mean " + name for name in _MEASUREMENTS]
    + [name + " error" for name in _MEASUREMENTS]
    + ["worst " + name for name in _MEASUREMENTS]
)

# Note: the relative path below may need adjusting to wherever the data lives.
df = pd.read_csv("../data/wdbc.data", header=None, names=column_labels)

In [None]:
# Display the first three rows as a quick sanity check of the loaded frame.
df.head(3)

In [None]:
# Replace the single-letter diagnosis codes with readable labels.
# The result is assigned back to the column rather than calling
# df["diagnosis"].replace(..., inplace=True): that form operates on a column
# selection (chained assignment), which recent pandas warns about and which
# leaves df unchanged when Copy-on-Write is enabled.
# Re-running this cell is harmless: already-relabelled values match neither key.
df["diagnosis"] = df["diagnosis"].replace({"B": "Benign", "M": "Malignant"})

df.head(3)

In [None]:
# Split the frame in two: the labels (the diagnosis we want to associate with each
# row) and the features (every column except the patient id and the diagnosis).

label_df = df["diagnosis"]
feature_df = df.drop(columns=["patient_id", "diagnosis"])

In [None]:
# Standardize the features. StandardScaler works column by column: each feature is
# centred on zero mean and rescaled to unit variance, so all features end up on a
# comparable scale.
from sklearn.preprocessing import StandardScaler

x = StandardScaler().fit(feature_df).transform(feature_df)
x

In [None]:
# Sanity check on the scaled array: overall mean should be ~0 and overall std ~1.
x.mean(), x.std()

In [None]:
# Project the standardized features onto their first two principal components.
from sklearn.decomposition import PCA

# n_components=2 because we want to plot the result in 2D.
# random_state pins the result for the case where svd_solver='auto' resolves to the
# randomized solver (which solver is chosen depends on the scikit-learn version and
# the data shape); the exact solvers ignore it.
pca = PCA(n_components=2, random_state=0)  # unfitted PCA instance
pca_data = pca.fit_transform(x)  # fit on x and return x projected onto the 2 components
pca_df = pd.DataFrame(data=pca_data, columns=["PC1", "PC2"])  # wrap the projection in a labelled frame

In [None]:
# Shows the fraction of the total variance explained by each fitted principal component.
pca.explained_variance_ratio_

In [None]:
# Scatter the samples in the plane of the first two principal components,
# drawing each diagnosis class in its own colour.
fig, ax = plt.subplots()
ax.set_xlabel("PC 1")
ax.set_ylabel("PC 2")
ax.set_title("Samples projected onto the first two principal components")

targets = ["Benign", "Malignant"]
colors = ["b", "r"]

for target, color in zip(targets, colors):
    # Plain boolean array: rows are selected by position, so the plot does not
    # depend on label_df and pca_df carrying identical index labels.
    indToKeep = (label_df == target).to_numpy()
    ax.scatter(pca_df.loc[indToKeep, "PC1"],
               pca_df.loc[indToKeep, "PC2"],
               c=color,
               label=target)

# Without a legend the reader cannot tell which colour is which class.
ax.legend()
plt.show()

In [None]:
# Fit a PCA that keeps every component. Only fit() is called because we want the
# variance ratios, not the data mapped onto the components (that is what transform does).
pca2 = PCA()
pca2.fit(x)
evr = pca2.explained_variance_ratio_
# One row per component: zero-based index, a tab, then the ratio to four decimals.
print("\n".join(f"{idx:2d}\t{ratio:0.4f}" for idx, ratio in enumerate(evr)))

In [None]:
# Cumulative explained variance as a function of how many components are kept.
cumulative_evr = np.cumsum(pca2.explained_variance_ratio_)
# Entry i of the cumulative sum covers i + 1 components, so the x-axis starts at 1
# (plotting against the default 0-based index would shift the curve by one).
component_counts = np.arange(1, len(cumulative_evr) + 1)

fig, ax = plt.subplots()
# The ratios are fractions in [0, 1]; scale by 100 so the values match the "%" axis label.
ax.plot(component_counts, cumulative_evr * 100)
ax.set_xlabel("Num components")
ax.set_ylabel("Cumulative explained variance (%)")
ax.set_title("Cumulative explained variance by number of components")
# Render the figure explicitly; a bare `plt` would only echo the module's repr.
plt.show()