# Cancer Diagnosis

 __<h3>DATASET DESCRIPTION__
 
 
 Here is a description of the dataset we will be using:

 Breast Cancer Wisconsin (Diagnostic) Database

 Notes
 -----
 Data Set Characteristics:
     :Number of Instances: 569

     :Number of Attributes: 30 numeric, predictive attributes and the class

     :Attribute Information:
         - radius (mean of distances from center to points on the perimeter)
         - texture (standard deviation of gray-scale values)
         - perimeter
         - area
         - smoothness (local variation in radius lengths)
         - compactness (perimeter^2 / area - 1.0)
         - concavity (severity of concave portions of the contour)
         - concave points (number of concave portions of the contour)
         - symmetry
         - fractal dimension ("coastline approximation" - 1)

         The mean, standard error, and "worst" or largest (mean of the three
         largest values) of these features were computed for each image,
         resulting in 30 features.  For instance, field 3 is Mean Radius, field
         13 is Radius SE, field 23 is Worst Radius.

         - class:
                 - WDBC-Malignant
                 - WDBC-Benign

     :Summary Statistics:

     ===================================== ======= ========
                                            Min     Max
     ===================================== ======= ========
     radius (mean):                         6.981   28.11
     texture (mean):                        9.71    39.28
     perimeter (mean):                      43.79   188.5
     area (mean):                           143.5   2501.0
     smoothness (mean):                     0.053   0.163
     compactness (mean):                    0.019   0.345
     concavity (mean):                      0.0     0.427
     concave points (mean):                 0.0     0.201
     symmetry (mean):                       0.106   0.304
     fractal dimension (mean):              0.05    0.097
     radius (standard error):               0.112   2.873
     texture (standard error):              0.36    4.885
     perimeter (standard error):            0.757   21.98
     area (standard error):                 6.802   542.2
     smoothness (standard error):           0.002   0.031
     compactness (standard error):          0.002   0.135
     concavity (standard error):            0.0     0.396
     concave points (standard error):       0.0     0.053
     symmetry (standard error):             0.008   0.079
     fractal dimension (standard error):    0.001   0.03
     radius (worst):                        7.93    36.04
     texture (worst):                       12.02   49.54
     perimeter (worst):                     50.41   251.2
     area (worst):                          185.2   4254.0
     smoothness (worst):                    0.071   0.223
     compactness (worst):                   0.027   1.058
     concavity (worst):                     0.0     1.252
     concave points (worst):                0.0     0.291
     symmetry (worst):                      0.156   0.664
     fractal dimension (worst):             0.055   0.208
     ===================================== ======= ========

     :Missing Attribute Values: None

     :Class Distribution: 212 - Malignant, 357 - Benign

     :Creator:  Dr. William H. Wolberg, W. Nick Street, Olvi L. Mangasarian

     :Donor: Nick Street

     :Date: November, 1995


In [31]:
# Initialize pyspark
import findspark
findspark.init()
import pyspark

In [32]:
# Initialize and create a spark session
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('cancer').getOrCreate()

In [33]:
# Using Spark to read the cancer data set
data = spark.read.csv('Cancer_Data', header=True, inferSchema=True)

In [34]:
# Printing the first row of the dataframe
data.head()

Row(mean radius=0, mean texture=17.99, mean perimeter=10.38, mean area=122.8, mean smoothness=1001.0, mean compactness=0.1184, mean concavity=0.2776, mean concave points=0.3001, mean symmetry=0.1471, mean fractal dimension=0.2419, radius error=0.07871, texture error=1.095, perimeter error=0.9053, area error=8.589, smoothness error=153.4, compactness error=0.006399, concavity error=0.04904, concave points error=0.05373, symmetry error=0.01587, fractal dimension error=0.03003, worst radius=0.006193, worst texture=25.38, worst perimeter=17.33, worst area=184.6, worst smoothness=2019.0, worst compactness=0.1622, worst concavity=0.6656, worst concave points=0.7119, worst symmetry=0.2654, worst fractal dimension=0.4601)

In [35]:
# Printing the schema of the dataframe
data.printSchema()

root
 |-- mean radius: integer (nullable = true)
 |-- mean texture: double (nullable = true)
 |-- mean perimeter: double (nullable = true)
 |-- mean area: double (nullable = true)
 |-- mean smoothness: double (nullable = true)
 |-- mean compactness: double (nullable = true)
 |-- mean concavity: double (nullable = true)
 |-- mean concave points: double (nullable = true)
 |-- mean symmetry: double (nullable = true)
 |-- mean fractal dimension: double (nullable = true)
 |-- radius error: double (nullable = true)
 |-- texture error: double (nullable = true)
 |-- perimeter error: double (nullable = true)
 |-- area error: double (nullable = true)
 |-- smoothness error: double (nullable = true)
 |-- compactness error: double (nullable = true)
 |-- concavity error: double (nullable = true)
 |-- concave points error: double (nullable = true)
 |-- symmetry error: double (nullable = true)
 |-- fractal dimension error: double (nullable = true)
 |-- worst radius: double (nullable = true)
 |-- worst

##### Setting up PCA

In [36]:
from pyspark.ml.feature import PCA, StandardScaler, VectorAssembler
from pyspark.ml.linalg import Vectors

Useing VectorAssembler to convert the input columns of the cancer data to a single output column of an array called "features"

In [37]:
len(data.columns)

30

In [38]:
assembler = VectorAssembler(inputCols=data.columns, outputCol='features')

In [39]:
output = assembler.transform(data)

In [40]:
output.printSchema()

root
 |-- mean radius: integer (nullable = true)
 |-- mean texture: double (nullable = true)
 |-- mean perimeter: double (nullable = true)
 |-- mean area: double (nullable = true)
 |-- mean smoothness: double (nullable = true)
 |-- mean compactness: double (nullable = true)
 |-- mean concavity: double (nullable = true)
 |-- mean concave points: double (nullable = true)
 |-- mean symmetry: double (nullable = true)
 |-- mean fractal dimension: double (nullable = true)
 |-- radius error: double (nullable = true)
 |-- texture error: double (nullable = true)
 |-- perimeter error: double (nullable = true)
 |-- area error: double (nullable = true)
 |-- smoothness error: double (nullable = true)
 |-- compactness error: double (nullable = true)
 |-- concavity error: double (nullable = true)
 |-- concave points error: double (nullable = true)
 |-- symmetry error: double (nullable = true)
 |-- fractal dimension error: double (nullable = true)
 |-- worst radius: double (nullable = true)
 |-- worst

Often its a good idea to normalize each feature to have unit standard deviation and/or zero mean, when using PCA. This is essentially a pre-step to PCA, but its not always necessary.

In [41]:
#Using standard scaler to normalize the data

scaler = StandardScaler(inputCol='features', outputCol='scaled_features', withStd=True, withMean=False)

In [42]:
scaler_model = scaler.fit(output)

In [43]:
scaled_data = scaler_model.transform(output)

In [44]:
scaled_data.printSchema()

root
 |-- mean radius: integer (nullable = true)
 |-- mean texture: double (nullable = true)
 |-- mean perimeter: double (nullable = true)
 |-- mean area: double (nullable = true)
 |-- mean smoothness: double (nullable = true)
 |-- mean compactness: double (nullable = true)
 |-- mean concavity: double (nullable = true)
 |-- mean concave points: double (nullable = true)
 |-- mean symmetry: double (nullable = true)
 |-- mean fractal dimension: double (nullable = true)
 |-- radius error: double (nullable = true)
 |-- texture error: double (nullable = true)
 |-- perimeter error: double (nullable = true)
 |-- area error: double (nullable = true)
 |-- smoothness error: double (nullable = true)
 |-- compactness error: double (nullable = true)
 |-- concavity error: double (nullable = true)
 |-- concave points error: double (nullable = true)
 |-- symmetry error: double (nullable = true)
 |-- fractal dimension error: double (nullable = true)
 |-- worst radius: double (nullable = true)
 |-- worst

##### PCA

In [45]:
pca = PCA(inputCol='scaled_features', outputCol='pca_features', k=4)

In [46]:
pca_model = pca.fit(scaled_data)

In [47]:
pcaDF = pca_model.transform(scaled_data)

In [48]:
results = pcaDF.select('pca_features')

Using head() to confirm that your output column Array of pcaFeatures whi9ch should have only has 4 principal components

In [49]:
results.head()

Row(pca_features=DenseVector([21.622, 8.5166, -3.7318, -0.4181]))

In [50]:
results.show(4, truncate=False)

+------------------------------------------------------------------------------+
|pca_features                                                                  |
+------------------------------------------------------------------------------+
|[21.62199738236476,8.516595739466684,-3.7318474175794782,-0.4181244970133412] |
|[15.121737034758134,2.697138979042207,-2.3546461829874357,-2.59498897333438]  |
|[18.432585609777654,5.697069543518227,-2.9058070696230303,-3.0552108608152326]|
|[18.95495650289368,16.025442209800573,-5.934803967957989,-4.158068180951641]  |
+------------------------------------------------------------------------------+
only showing top 4 rows



In [None]:
#Closing the spark session
spark.stop()