In [1]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from genetic_selection import GeneticSelectionCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
import pandas as pd
import numpy as np

In [2]:
breast_cancer_data = load_breast_cancer()
# 30 features, 2 classes

In [3]:
# Preprocessing
data_df = pd.DataFrame(data = breast_cancer_data.data,columns = breast_cancer_data.feature_names)
data_df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [4]:
data_df.info()

<class 'pandas.core.frame.DataFrame'>

RangeIndex: 569 entries, 0 to 568

Data columns (total 30 columns):

 #   Column                   Non-Null Count  Dtype  

---  ------                   --------------  -----  

 0   mean radius              569 non-null    float64

 1   mean texture             569 non-null    float64

 2   mean perimeter           569 non-null    float64

 3   mean area                569 non-null    float64

 4   mean smoothness          569 non-null    float64

 5   mean compactness         569 non-null    float64

 6   mean concavity           569 non-null    float64

 7   mean concave points      569 non-null    float64

 8   mean symmetry            569 non-null    float64

 9   mean fractal dimension   569 non-null    float64

 10  radius error             569 non-null    float64

 11  texture error            569 non-null    float64

 12  perimeter error          569 non-null    float64

 13  area error               569 non-null    float64

 14  smoothn

In [5]:
data_df.corr()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
mean radius,1.0,0.323782,0.997855,0.987357,0.170581,0.506124,0.676764,0.822529,0.147741,-0.311631,...,0.969539,0.297008,0.965137,0.941082,0.119616,0.413463,0.526911,0.744214,0.163953,0.007066
mean texture,0.323782,1.0,0.329533,0.321086,-0.023389,0.236702,0.302418,0.293464,0.071401,-0.076437,...,0.352573,0.912045,0.35804,0.343546,0.077503,0.27783,0.301025,0.295316,0.105008,0.119205
mean perimeter,0.997855,0.329533,1.0,0.986507,0.207278,0.556936,0.716136,0.850977,0.183027,-0.261477,...,0.969476,0.303038,0.970387,0.94155,0.150549,0.455774,0.563879,0.771241,0.189115,0.051019
mean area,0.987357,0.321086,0.986507,1.0,0.177028,0.498502,0.685983,0.823269,0.151293,-0.28311,...,0.962746,0.287489,0.95912,0.959213,0.123523,0.39041,0.512606,0.722017,0.14357,0.003738
mean smoothness,0.170581,-0.023389,0.207278,0.177028,1.0,0.659123,0.521984,0.553695,0.557775,0.584792,...,0.21312,0.036072,0.238853,0.206718,0.805324,0.472468,0.434926,0.503053,0.394309,0.499316
mean compactness,0.506124,0.236702,0.556936,0.498502,0.659123,1.0,0.883121,0.831135,0.602641,0.565369,...,0.535315,0.248133,0.59021,0.509604,0.565541,0.865809,0.816275,0.815573,0.510223,0.687382
mean concavity,0.676764,0.302418,0.716136,0.685983,0.521984,0.883121,1.0,0.921391,0.500667,0.336783,...,0.688236,0.299879,0.729565,0.675987,0.448822,0.754968,0.884103,0.861323,0.409464,0.51493
mean concave points,0.822529,0.293464,0.850977,0.823269,0.553695,0.831135,0.921391,1.0,0.462497,0.166917,...,0.830318,0.292752,0.855923,0.80963,0.452753,0.667454,0.752399,0.910155,0.375744,0.368661
mean symmetry,0.147741,0.071401,0.183027,0.151293,0.557775,0.602641,0.500667,0.462497,1.0,0.479921,...,0.185728,0.090651,0.219169,0.177193,0.426675,0.4732,0.433721,0.430297,0.699826,0.438413
mean fractal dimension,-0.311631,-0.076437,-0.261477,-0.28311,0.584792,0.565369,0.336783,0.166917,0.479921,1.0,...,-0.253691,-0.051269,-0.205151,-0.231854,0.504942,0.458798,0.346234,0.175325,0.334019,0.767297


In [6]:
# Extracting the labels
y = breast_cancer_data.target
# Extracting the values
x = breast_cancer_data.data

In [7]:
# Applying PCA & KNN

# splitting train & test data
x_train, x_test, y_train, y_test = train_test_split(x, y)
# scaling 
scaler = MinMaxScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

pca = PCA(0.95)
x_train = pca.fit_transform(x_train)
x_test = pca.transform(x_test)

print(x_train.shape)
print(x_test.shape)

model = KNeighborsClassifier(n_neighbors=3) # n_neighbors=5 , n_neighbors=2
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

print('classification_report: ')
print(classification_report(y_test, y_pred))

(426, 10)

(143, 10)

classification_report: 

              precision    recall  f1-score   support



           0       1.00      0.94      0.97        65

           1       0.95      1.00      0.97        78



    accuracy                           0.97       143

   macro avg       0.98      0.97      0.97       143

weighted avg       0.97      0.97      0.97       143





  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [10]:
# checking other distance metrics : euclidean distance

x_train, x_test, y_train, y_test = train_test_split(x, y)
scaler = MinMaxScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

model = KNeighborsClassifier(n_neighbors=3, metric='euclidean')
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

print('classification_report: ')
print(classification_report(y_test, y_pred))

classification_report: 

              precision    recall  f1-score   support



           0       0.94      0.98      0.96        48

           1       0.99      0.97      0.98        95



    accuracy                           0.97       143

   macro avg       0.96      0.97      0.97       143

weighted avg       0.97      0.97      0.97       143





  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [11]:
# Applying DecisionTreeClassifier & LDA


x_train, x_test, y_train, y_test = train_test_split(x, y)
scaler = MinMaxScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

lda = LinearDiscriminantAnalysis()
x_train = lda.fit_transform(x_train, y_train)
x_test = lda.transform(x_test)

print(x_train.shape)
print(x_test.shape)

model = DecisionTreeClassifier()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

print('classification_report: ')
print(classification_report(y_test, y_pred))

(426, 1)

(143, 1)

classification_report: 

              precision    recall  f1-score   support



           0       0.89      1.00      0.94        50

           1       1.00      0.94      0.97        93



    accuracy                           0.96       143

   macro avg       0.95      0.97      0.96       143

weighted avg       0.96      0.96      0.96       143




In [12]:
# checking genetics algorithm & SVM

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3)
scaler = MinMaxScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

estimators = LogisticRegression(solver='liblinear', multi_class='ovr')
ga = GeneticSelectionCV(estimators,
                        cv=5,
                        verbose=0,
                        scoring="accuracy",
                        max_features=6,
                        n_population=60,
                        crossover_proba=0.6,
                        mutation_proba=0.2,
                        n_generations=50,
                        crossover_independent_proba=0.6,
                        mutation_independent_proba=0.06,
                        tournament_size=4,
                        n_gen_no_change=20,
                        caching=True,
                        n_jobs=-1)

x_train = ga.fit_transform(x_train, y_train)
x_test = ga.transform(x_test)

print(x_train.shape)
print(x_test.shape)

model = SVC(decision_function_shape='ovo',  kernel='poly') # 'ovr'  , 'rbf'
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

print('classification_report: ')
print(classification_report(y_test, y_pred))

(398, 6)

(171, 6)

classification_report: 

              precision    recall  f1-score   support



           0       0.98      0.92      0.95        60

           1       0.96      0.99      0.97       111



    accuracy                           0.96       171

   macro avg       0.97      0.95      0.96       171

weighted avg       0.97      0.96      0.96       171




ترکیب الگوریتم های کاهش بعد و مدل های مختلف مثلا استفاده از   
LDA
در کنار 
Dicision Tree
صرفا برای تمرین کدنویسی بوده و به معنای درست بودن استفاده کردن از این دو در کنار یکدیگر نمی باشد
تحلیل این موضوع که کدام الگوریتم کاهش بعد در کنار کدام مدل دقت بهتری را ارائه میدهد در یک پی دی اف همراه با این فایل ارسال شده است