In [17]:
import numpy as np
from numpy.linalg import eig
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
import seaborn as sns
from sklearn.metrics import confusion_matrix

In [2]:
# Toy data set: three samples (rows) with two features (columns).
A = np.arange(1, 7).reshape(3, 2)
print(A)

[[1 2]
 [3 4]
 [5 6]]


# Calculate the mean of each column

In [3]:
# Column-wise mean of A: axis 0 collapses the rows.
M = A.mean(axis=0)
print(M)

[3. 4.]


# Center columns by subtracting column means

In [4]:
# Centre the data: broadcasting subtracts the column means M from every row of A.
C = np.subtract(A, M)
print(C)

[[-2. -2.]
 [ 0.  0.]
 [ 2.  2.]]


# Calculate the covariance matrix of the centered matrix

In [5]:
# Covariance between the columns of C; rowvar=False treats each column as a variable.
V = np.cov(C, rowvar=False)
print(V)

[[4. 4.]
 [4. 4.]]


# Eigendecomposition of covariance matrix

In [6]:
# Eigendecomposition of the covariance matrix V.
# numpy.linalg.eig does not promise any particular eigenvalue order, so the
# (eigenvalue, eigenvector) pairs are sorted by descending eigenvalue. After the
# sort, column i of `vectors` belongs to the i-th largest entry of `values`.
values, vectors = eig(V)
_order = np.argsort(values)[::-1]
values, vectors = values[_order], vectors[:, _order]
print(values)
print(vectors)

[8. 0.]
[[ 0.70710678 -0.70710678]
 [ 0.70710678  0.70710678]]


# Project data

In [7]:
# Express every centred sample in the eigenvector basis.
# P holds one column per sample, so it is transposed back to one row per
# sample before printing.
P = vectors.T @ C.T
print(P.T)

[[-2.82842712  0.        ]
 [ 0.          0.        ]
 [ 2.82842712  0.        ]]


In [8]:
from sklearn.decomposition import PCA

In [9]:
# Fit scikit-learn's PCA on the same toy matrix, keeping two components.
pca = PCA(n_components=2)
pca.fit(A)

PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [10]:
# Fitted component vectors, then the variance explained by each of them.
print(pca.components_, pca.explained_variance_, sep="\n")

[[ 0.70710678  0.70710678]
 [-0.70710678  0.70710678]]
[8. 0.]


In [11]:
# Project the toy matrix A with the fitted PCA and show the result.
# Note: the name B is reassigned by a later cell that transforms cancer1.
B = pca.transform(A)
print(B)

[[-2.82842712e+00 -2.22044605e-16]
 [ 0.00000000e+00  0.00000000e+00]
 [ 2.82842712e+00  2.22044605e-16]]


# Applying PCA to the breast cancer data

In [12]:
import pandas as pd
# Load the breast cancer table from a CSV file in the working directory.
cancer = pd.read_csv("breast_cancer.csv")
# Preview the first rows; as the last expression it is rendered as the cell output.
cancer.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [13]:
# Work on a copy of the table without the 'id' column
# (presumably a record identifier rather than a measurement - confirm).
cancer1 = cancer.drop(columns=['id'])

from sklearn.preprocessing import LabelEncoder

# Replace the diagnosis labels with integer codes.
labelencoder = LabelEncoder()
diagnosis_codes = labelencoder.fit_transform(cancer1['diagnosis'])
cancer1['diagnosis'] = diagnosis_codes

# Show the column dtypes to confirm that 'diagnosis' is now numeric.
cancer1.dtypes

diagnosis                    int64
radius_mean                float64
texture_mean               float64
perimeter_mean             float64
area_mean                  float64
smoothness_mean            float64
compactness_mean           float64
concavity_mean             float64
concave points_mean        float64
symmetry_mean              float64
fractal_dimension_mean     float64
radius_se                  float64
texture_se                 float64
perimeter_se               float64
area_se                    float64
smoothness_se              float64
compactness_se             float64
concavity_se               float64
concave points_se          float64
symmetry_se                float64
fractal_dimension_se       float64
radius_worst               float64
texture_worst              float64
perimeter_worst            float64
area_worst                 float64
smoothness_worst           float64
compactness_worst          float64
concavity_worst            float64
concave points_worst

In [14]:
# Fit a four-component PCA on the breast cancer table.
# NOTE(review): cancer1 still contains the encoded 'diagnosis' column and no
# scaling is applied before the fit, so the label takes part in the
# decomposition - confirm this is intended.
pca1 = PCA(n_components=4)
pca1.fit(cancer1)

PCA(copy=True, iterated_power='auto', n_components=4, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [15]:
# Fitted component vectors, then the variance explained by each of them.
print(pca1.components_, pca1.explained_variance_, sep="\n")

[[ 5.32491176e-04  5.08623135e-03  2.19657036e-03  3.50763253e-02
   5.16826394e-01  4.23694648e-06  4.05260057e-05  8.19399513e-05
   4.77807747e-05  7.07804557e-06 -2.62155144e-06  3.13742465e-04
  -6.50983758e-05  2.23634120e-03  5.57271577e-02 -8.05645804e-07
   5.51918267e-06  8.87094529e-06  3.27915015e-06 -1.24101786e-06
  -8.54529271e-08  7.15473171e-03  3.06736655e-03  4.94576389e-02
   8.52063272e-01  6.42005767e-06  1.01275948e-04  1.68928633e-04
   7.36658167e-05  1.78986335e-05  1.61356430e-06]
 [-2.20205422e-04  9.28705507e-03 -2.88161661e-03  6.27480686e-02
   8.51823762e-01 -1.48194772e-05 -2.68878874e-06  7.51417358e-05
   4.63500068e-05 -2.52431219e-05 -1.61197311e-05 -5.38692782e-05
   3.48370004e-04  8.19640685e-04  7.51116765e-03  1.49437848e-06
   1.27357585e-05  2.86920519e-05  9.36006123e-06  1.22647345e-05
   2.89680219e-07 -5.68676306e-04 -1.32152785e-02 -1.85987211e-04
  -5.19742244e-01 -7.68566578e-05 -2.56104754e-04 -1.75472256e-04
  -3.05054016e-05 -1.5704

In [16]:
# Project cancer1 onto the four fitted components and show the result.
# Note: this overwrites the B computed earlier from the toy matrix A.
B = pca1.transform(cancer1)
print(B)

[[1160.14274385 -293.91753487   48.57838829   -8.71977365]
 [1269.12259735   15.63019385  -35.39433793   17.86050902]
 [ 995.7940818    39.15672295   -1.7099218     4.19611699]
 ...
 [ 314.50204562   47.55342261  -10.44324763   -9.77775872]
 [1124.85828992   34.12921837  -19.74220047  -23.65584083]
 [-771.52771076  -88.64311299   23.88898053    2.54937566]]


In [19]:
# Reload the CSV and discard the 'id' column in a single chained expression.
bc_traindata = pd.read_csv('breast_cancer.csv').drop(columns=['id'])

In [22]:
# Encode the diagnosis labels as integer codes.
# The column is assigned by name rather than by position: `.iloc[:, 0] = ...`
# silently depends on 'diagnosis' being the first column, and on pandas >= 2.0
# it writes into the existing column in place, which can leave it with object
# dtype instead of an integer dtype.
LE = LabelEncoder()
bc_traindata['diagnosis'] = LE.fit_transform(bc_traindata['diagnosis'])

In [23]:
# Feature selection with a statistical test (chi-squared)

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2


In [28]:
# Split the frame into the feature matrix x and the target vector y.
x = bc_traindata.drop(columns=['diagnosis'])
y = bc_traindata['diagnosis']

In [29]:
# Score every feature against the target with the chi-squared test and keep
# the four best-scoring ones.
test = SelectKBest(chi2, k=4)
fit = test.fit(x, y)


In [34]:
# Print the per-feature scores with three digits of precision.
import numpy
numpy.set_printoptions(precision=3)
print(fit.scores_)

# Keep only the selected columns and preview the first five rows.
features = fit.transform(x)
print(features[:5])

[2.661e+02 9.390e+01 2.011e+03 5.399e+04 1.499e-01 5.403e+00 1.971e+01
 1.054e+01 2.574e-01 7.431e-05 3.468e+01 9.794e-03 2.506e+02 8.759e+03
 3.266e-03 6.138e-01 1.045e+00 3.052e-01 8.036e-05 6.371e-03 4.917e+02
 1.744e+02 3.665e+03 1.126e+05 3.974e-01 1.931e+01 3.952e+01 1.349e+01
 1.299e+00 2.315e-01]
[[1001.    153.4   184.6  2019.  ]
 [1326.     74.08  158.8  1956.  ]
 [1203.     94.03  152.5  1709.  ]
 [ 386.1    27.23   98.87  567.7 ]
 [1297.     94.44  152.2  1575.  ]]


In [37]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

In [38]:
# Recursive feature elimination with a logistic-regression estimator,
# reducing the feature set to three columns.
model = LogisticRegression()
# n_features_to_select is passed by keyword: scikit-learn >= 1.0 rejects it as
# a positional argument and raises a TypeError.
rfe = RFE(model, n_features_to_select=3)
fit = rfe.fit(x, y)
print(rfe.n_features_)
print(rfe.support_)

# Transform once and reuse the result for both the printout and `features`.
features = fit.transform(x)
print(features)

3
[False False False False False False  True False False False False False
 False False False False False False False False  True False False False
 False False  True False False False]
[[ 0.3   25.38   0.712]
 [ 0.087 24.99   0.242]
 [ 0.197 23.57   0.45 ]
 ...
 [ 0.093 18.98   0.34 ]
 [ 0.351 25.74   0.939]
 [ 0.     9.456  0.   ]]
