PRINCIPAL COMPONENT ANALYSIS

In [None]:
#IMPORT LIBRARIES
import numpy as np
import pandas as pd

from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB,BernoulliNB, MultinomialNB
from sklearn.preprocessing import StandardScaler

# Load the breast cancer dataset that ships with scikit-learn.
# The returned object is used throughout the notebook via its
# .data, .target, .feature_names and .target_names attributes.
breast = datasets.load_breast_cancer()


In [None]:
# List the attribute names exposed by the loaded dataset object
dir(breast)

['DESCR',
 'data',
 'data_module',
 'feature_names',
 'filename',
 'frame',
 'target',
 'target_names']

In [None]:
# Peek at the first two rows of the raw feature matrix
breast.data[:2]

array([[1.799e+01, 1.038e+01, 1.228e+02, 1.001e+03, 1.184e-01, 2.776e-01,
        3.001e-01, 1.471e-01, 2.419e-01, 7.871e-02, 1.095e+00, 9.053e-01,
        8.589e+00, 1.534e+02, 6.399e-03, 4.904e-02, 5.373e-02, 1.587e-02,
        3.003e-02, 6.193e-03, 2.538e+01, 1.733e+01, 1.846e+02, 2.019e+03,
        1.622e-01, 6.656e-01, 7.119e-01, 2.654e-01, 4.601e-01, 1.189e-01],
       [2.057e+01, 1.777e+01, 1.329e+02, 1.326e+03, 8.474e-02, 7.864e-02,
        8.690e-02, 7.017e-02, 1.812e-01, 5.667e-02, 5.435e-01, 7.339e-01,
        3.398e+00, 7.408e+01, 5.225e-03, 1.308e-02, 1.860e-02, 1.340e-02,
        1.389e-02, 3.532e-03, 2.499e+01, 2.341e+01, 1.588e+02, 1.956e+03,
        1.238e-01, 1.866e-01, 2.416e-01, 1.860e-01, 2.750e-01, 8.902e-02]])

In [None]:
# Show the names of the feature columns (one name per column of breast.data)
breast.feature_names

array(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error',
       'fractal dimension error', 'worst radius', 'worst texture',
       'worst perimeter', 'worst area', 'worst smoothness',
       'worst compactness', 'worst concavity', 'worst concave points',
       'worst symmetry', 'worst fractal dimension'], dtype='<U23')

In [None]:
# Show the human-readable class names of the target variable
breast.target_names

array(['malignant', 'benign'], dtype='<U9')

In [None]:
# Peek at the first three integer-encoded target labels
breast.target[:3]

array([0, 0, 0])

In [None]:
# Wrap the independent feature values in a DataFrame, one named column per feature
df = pd.DataFrame(data=breast.data, columns=breast.feature_names)
df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [None]:
# Append the dependent variable (target) as a new column of df,
# then inspect rows 50-69 to see a mix of both classes
df['target'] = breast.target
df.iloc[50:70]

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
50,11.76,21.6,74.72,427.9,0.08637,0.04966,0.01657,0.01115,0.1495,0.05888,...,25.72,82.98,516.5,0.1085,0.08615,0.05523,0.03715,0.2433,0.06563,1
51,13.64,16.34,87.21,571.8,0.07685,0.06059,0.01857,0.01723,0.1353,0.05953,...,23.19,96.08,656.7,0.1089,0.1582,0.105,0.08586,0.2346,0.08025,1
52,11.94,18.24,75.71,437.6,0.08261,0.04751,0.01972,0.01349,0.1868,0.0611,...,21.33,83.67,527.2,0.1144,0.08906,0.09203,0.06296,0.2785,0.07408,1
53,18.22,18.7,120.3,1033.0,0.1148,0.1485,0.1772,0.106,0.2092,0.0631,...,24.13,135.1,1321.0,0.128,0.2297,0.2623,0.1325,0.3021,0.07987,0
54,15.1,22.02,97.26,712.8,0.09056,0.07081,0.05253,0.03334,0.1616,0.05684,...,31.69,117.7,1030.0,0.1389,0.2057,0.2712,0.153,0.2675,0.07873,0
55,11.52,18.75,73.34,409.0,0.09524,0.05473,0.03036,0.02278,0.192,0.05907,...,22.47,81.81,506.2,0.1249,0.0872,0.09076,0.06316,0.3306,0.07036,1
56,19.21,18.57,125.5,1152.0,0.1053,0.1267,0.1323,0.08994,0.1917,0.05961,...,28.14,170.1,2145.0,0.1624,0.3511,0.3879,0.2091,0.3537,0.08294,0
57,14.71,21.59,95.55,656.9,0.1137,0.1365,0.1293,0.08123,0.2027,0.06758,...,30.7,115.7,985.5,0.1368,0.429,0.3587,0.1834,0.3698,0.1094,0
58,13.05,19.31,82.61,527.2,0.0806,0.03789,0.000692,0.004167,0.1819,0.05501,...,22.25,90.24,624.1,0.1021,0.06191,0.001845,0.01111,0.2439,0.06289,1
59,8.618,11.79,54.34,224.5,0.09752,0.05272,0.02061,0.007799,0.1683,0.07187,...,15.4,59.9,274.9,0.1733,0.1239,0.1168,0.04419,0.322,0.09026,1


In [None]:
# Feature matrix (x) for training and testing: every column except the target,
# converted to a plain NumPy array
x = df.drop(columns=['target']).to_numpy()

In [None]:
# Target vector (y): the integer-encoded class labels.
# Used by the PCA train/test splits later in the notebook.
y=breast.target

In [None]:
# Hold out 30% of the samples for testing and train on the remaining 70%;
# the fixed random_state keeps the partition reproducible
X_train, X_test, y_train, y_test = train_test_split(
    x,
    breast.target,
    test_size=0.3,
    random_state=100,
)

MODEL BUILDING

Naive Bayes and Logistic Regression

For Naive Bayes, I am going to use the Gaussian variant because, given the number of features and their continuous numeric nature, Gaussian Naive Bayes is the most suitable choice for this task.

In [None]:
# Fit a Gaussian Naive Bayes classifier on the training split (fit returns the
# estimator itself) and report its score on the held-out test split
model = GaussianNB().fit(X_train, y_train)
model.score(X_test, y_test)

0.9239766081871345

In [None]:
# Fit a Logistic Regression classifier on the same split and score it on the test set.
# max_iter is set very high -- presumably so the solver has room to converge on
# these unscaled features (TODO confirm).
model2 = LogisticRegression(max_iter=1_000_000).fit(X_train, y_train)
model2.score(X_test, y_test)

0.9532163742690059

a) Using all 30 features, the model score (test accuracy) is **92%** for Naive Bayes and **95%** for Logistic Regression.

In [None]:
# Restrict the features to five selected "mean" measurements
X2 = df[[
    'mean texture',
    'mean perimeter',
    'mean smoothness',
    'mean compactness',
    'mean symmetry',
]]

In [None]:
# Convert the selected feature columns to a plain NumPy array.
# np.asarray is used instead of `.values` so the cell is idempotent: on a first
# run it converts the DataFrame, and on a re-run it returns the existing array
# unchanged (the original `X2.values` raised AttributeError when run twice,
# because an ndarray has no `.values` attribute).
X2 = np.asarray(X2)

In [None]:
# Re-split using only the five selected features (same 70/30 ratio and seed as
# the all-features split); this rebinds X_train / X_test / y_train / y_test
X_train, X_test, y_train, y_test = train_test_split(
    X2,
    breast.target,
    test_size=0.3,
    random_state=100,
)

In [None]:
# 1. Gaussian Naive Bayes on the five-feature subset: fit on train, score on test
model = GaussianNB().fit(X_train, y_train)
model.score(X_test, y_test)

0.9239766081871345

2. Logistic Regression

In [None]:
# Logistic Regression on the five-feature subset: fit on train, score on test
model2 = LogisticRegression(max_iter=1_000_000).fit(X_train, y_train)
model2.score(X_test, y_test)

0.8830409356725146

b) Using only 'mean texture', 'mean perimeter', 'mean smoothness', 'mean compactness', and 'mean symmetry', the model score is **92%** for Naive Bayes and **88%** for Logistic Regression.

Principal Component Analysis (PCA)

In [None]:
# Features only (target column removed) as the input for PCA
df_pca = df.drop(columns=['target'])

# Summary statistics of the first four features, to show how different their scales are
print(df_pca.iloc[:, :4].describe())

       mean radius  mean texture  mean perimeter    mean area
count   569.000000    569.000000      569.000000   569.000000
mean     14.127292     19.289649       91.969033   654.889104
std       3.524049      4.301036       24.298981   351.914129
min       6.981000      9.710000       43.790000   143.500000
25%      11.700000     16.170000       75.170000   420.300000
50%      13.370000     18.840000       86.240000   551.100000
75%      15.780000     21.800000      104.100000   782.700000
max      28.110000     39.280000      188.500000  2501.000000


In [None]:

# Standardise every independent variable and keep the result as a DataFrame
# with the original feature names.
# NOTE(review): this rebinds `x`, which earlier held the unscaled feature array,
# and the scaler is fit on all rows before any train/test split is made.
x = pd.DataFrame(
    StandardScaler().fit_transform(df_pca),
    columns=breast.feature_names,
)

# Summary of the first four scaled features (mean ~0, std ~1 expected)
print(x.iloc[:, :4].describe())

        mean radius  mean texture  mean perimeter     mean area
count  5.690000e+02  5.690000e+02    5.690000e+02  5.690000e+02
mean  -3.153111e-15 -6.568462e-15   -6.993039e-16 -8.553985e-16
std    1.000880e+00  1.000880e+00    1.000880e+00  1.000880e+00
min   -2.029648e+00 -2.229249e+00   -1.984504e+00 -1.454443e+00
25%   -6.893853e-01 -7.259631e-01   -6.919555e-01 -6.671955e-01
50%   -2.150816e-01 -1.046362e-01   -2.359800e-01 -2.951869e-01
75%    4.693926e-01  5.841756e-01    4.996769e-01  3.635073e-01
max    3.971288e+00  4.651889e+00    3.976130e+00  5.250529e+00


PCA With n_components = 2

In [None]:
# Keep only the first two principal components (n_components = 2)
pca = PCA(n_components=2)

# Fit PCA on the scaled features and project them onto the two components.
# NOTE(review): PCA is fit on all rows, before the train/test split below.
pca_x = pca.fit_transform(x)

# Check the shape of the projected data
pca_x.shape

(569, 2)

In [None]:
# Display the PCA-projected data (one row per sample, one column per component)
pca_x

array([[ 9.19283683,  1.94858307],
       [ 2.3878018 , -3.76817174],
       [ 5.73389628, -1.0751738 ],
       ...,
       [ 1.25617928, -1.90229671],
       [10.37479406,  1.67201011],
       [-5.4752433 , -0.67063679]])

In [None]:
# Fraction of the total variance explained by each retained component
pca.explained_variance_ratio_

array([0.44272026, 0.18971182])

In [None]:
# Split the 2-component PCA data into training (70%) and testing (30%).
# NOTE(review): random_state=30 here differs from the random_state=100 used for
# the earlier splits, so these scores are measured on a different test
# partition -- confirm this is intended.
X_train_pca, X_test_pca, y_train, y_test = train_test_split(
    pca_x,
    y,
    test_size=0.3,
    random_state=30,
)

In [None]:
# Fit Gaussian Naive Bayes on the 2-component PCA training data and score it on the test data
model = GaussianNB().fit(X_train_pca, y_train)
model.score(X_test_pca, y_test)

0.9181286549707602

In [None]:
# Fit Logistic Regression on the 2-component PCA training data and score it on the test data
model2 = LogisticRegression(max_iter=1_000_000).fit(X_train_pca, y_train)
model2.score(X_test_pca, y_test)

0.9473684210526315

c) Using 2 principal components from PCA, Naive Bayes attained a model score of **92%** and Logistic Regression attained a model score of **95%**.

PCA - Retaining 95% of the information (variance) of the dataset

In [None]:
# Retain 95% of the information (variance) of the dataset: a fractional
# n_components asks PCA to choose how many components are needed to reach
# that share of explained variance
pca = PCA(n_components=0.95)

# Fit on the scaled features and project them onto the selected components
pca_x = pca.fit_transform(x)

# Check the shape of the projected data
pca_x.shape

(569, 10)

In [None]:
# Fraction of the total variance explained by each retained component
pca.explained_variance_ratio_

array([0.44272026, 0.18971182, 0.09393163, 0.06602135, 0.05495768,
       0.04024522, 0.02250734, 0.01588724, 0.01389649, 0.01168978])

In [None]:
# Number of components PCA selected to reach the requested variance fraction
pca.n_components_

10

d) i. The number of principal components needed to preserve 95% of the information (variance) in the dataset is **10**.

In [None]:
# Split the 95%-variance PCA data into training (70%) and testing (30%).
# NOTE(review): random_state=30 matches the 2-component PCA split but differs
# from the random_state=100 used for the non-PCA splits -- confirm this is intended.
X_train_pca, X_test_pca, y_train, y_test = train_test_split(
    pca_x,
    y,
    test_size=0.3,
    random_state=30,
)

In [None]:
# Fit Gaussian Naive Bayes on the 95%-variance PCA training data and score it on the test data
model = GaussianNB().fit(X_train_pca, y_train)
model.score(X_test_pca, y_test)

0.9181286549707602

In [None]:
# Fit Logistic Regression on the 95%-variance PCA training data and score it on the test data
model2 = LogisticRegression(max_iter=1_000_000).fit(X_train_pca, y_train)
model2.score(X_test_pca, y_test)

0.9941520467836257

d) ii. After reducing the dataset to the components that preserve 95% of its variance, Naive Bayes attained a model score of **92%**, and Logistic Regression attained a model score of **99%**.