# From data pre-processing to the use of algorithms

In [1]:
from sklearn.datasets import load_breast_cancer
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.model_selection import train_test_split
import random
random.seed(42)
np.random.seed(42)

In [2]:
def calculate_accuracy(y, y_pred):
    res = y == y_pred
    return np.mean(res)

In [8]:
data = load_breast_cancer()
df = pd.DataFrame(data.data, columns=[data.feature_names])
df['target'] = pd.Series(data=data.target,index=df.index )
df

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,0.07871,...,25.380,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890
1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,0.05667,...,24.990,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902
2,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,0.05999,...,23.570,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,0.09744,...,14.910,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300
4,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,0.05883,...,22.540,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,0.05623,...,25.450,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115
565,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,0.05533,...,23.690,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637
566,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,0.05648,...,18.980,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820
567,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,0.07016,...,25.740,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400


## Data pre-processing

### How much data rows and columns do we have?

In [4]:
rows, cols = df.shape
cols -= 1  # because we added target as column above (which is not an original column)
print(f'Rows: {rows}; Columns: {cols}')

Rows: 569; Columns: 30


### How much malignent data rows and benign data rows do we have? (target column)

In [5]:
df['target'].value_counts()

(target,)
1            357
0            212
dtype: int64

### Check for null values and decide on what you are doing with them

In [9]:
df.isnull().values.any()  # no null values - nothing to do here

False

### What are the data types of the columns, do we need to one-hot encode anything here?

In [6]:
df.dtypes # no categorical value - nothing to do here

mean radius                float64
mean texture               float64
mean perimeter             float64
mean area                  float64
mean smoothness            float64
mean compactness           float64
mean concavity             float64
mean concave points        float64
mean symmetry              float64
mean fractal dimension     float64
radius error               float64
texture error              float64
perimeter error            float64
area error                 float64
smoothness error           float64
compactness error          float64
concavity error            float64
concave points error       float64
symmetry error             float64
fractal dimension error    float64
worst radius               float64
worst texture              float64
worst perimeter            float64
worst area                 float64
worst smoothness           float64
worst compactness          float64
worst concavity            float64
worst concave points       float64
worst symmetry      

### Are there values that need to be normalized? If so, then normalize them before putting them into the algorithms

In [8]:
not_normalized_columns = []
for col in df:
    if col[0] == 'target':
        continue
    col_entries = df[col]
    # we simply span all values between 0 and 1
    _min, _max = col_entries.min(), col_entries.max()
    df[col] = (col_entries - _min) / (_max - _min)

### Feature selection

In [9]:
# We have 30 dimensions in this dataset which is a lot!
# Lets use a basic random forest algorithm for sklearn and only use the most influential features!
# the random forest implementation has an 'feature_importances_' attribute that you can call
# use only the top 10 features

In [10]:
# data X with labels y
X, y = df.iloc[:,:-1], df.iloc[:,-1]
X = X.to_numpy()
y = y.to_numpy()

# split them into train and test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True
)

In [11]:
clf = RandomForestClassifier(n_estimators=3, random_state=42)
clf.fit(X_train, y_train)

RandomForestClassifier(n_estimators=3, random_state=42)

In [12]:
feature_importances = clf.feature_importances_
important_features = np.argsort(feature_importances)[-20:]
X_train = X_train[:,important_features]
X_test = X_test[:,important_features]

## Algorithms

### k-NN

In [13]:
clf_knn = KNeighborsClassifier(n_neighbors=5)
clf_knn.fit(X_train, y_train)

y_pred = clf_knn.predict(X_test)
print('k-NN accuracy: ', calculate_accuracy(y_test, y_pred))

k-NN accuracy:  0.9649122807017544


### Logistic Regression

In [14]:
clf_lr = LogisticRegression(penalty='l2')
clf_lr.fit(X_train, y_train)

y_pred = clf_lr.predict(X_test)
print('Logistic regression accuracy:', calculate_accuracy(y_test, y_pred))

Logistic regression accuracy: 0.9649122807017544


### Support Vector Machine

In [15]:
clf_svm = svm.SVC(C = 10)
clf_svm.fit(X_train, y_train)

y_pred = clf_svm.predict(X_test)
print('SVM accuracy:', calculate_accuracy(y_test, y_pred))

SVM accuracy: 0.9736842105263158


### Random Forest Classifier

In [16]:
clf_RFC = RandomForestClassifier(n_estimators=10, random_state=42)
clf_RFC.fit(X_train, y_train)

y_pred = clf_RFC.predict(X_test)
print('Random Forest Classifier Accuracy:', calculate_accuracy(y_test, y_pred))

Random Forest Classifier Accuracy: 0.9649122807017544


## What happens if you use the following dataset?

In [17]:
def create_dataset():
    # Create dataset
    np.random.seed(42)  # always the same data generation
    X = np.stack((np.random.normal(size=100),np.random.normal(size=100)),axis = 1)
    y = np.array([1 if (xy[0]**2+xy[1]**2)**0.5 <0.5 else 0 for xy in X])
    
    return X, y

In [18]:
X, y = create_dataset()
# split them into train and test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True
)

### k-NN

In [19]:
clf_knn = KNeighborsClassifier(n_neighbors=20)
clf_knn.fit(X_train, y_train)

y_pred = clf_knn.predict(X_test)
print('k-NN accuracy: ', calculate_accuracy(y_test, y_pred))

k-NN accuracy:  1.0


### Logistic regression

In [20]:
clf_lr = LogisticRegression(penalty='l2')
clf_lr.fit(X_train, y_train)

y_pred = clf_lr.predict(X_test)
print('Logistic regression accuracy:', calculate_accuracy(y_test, y_pred))

Logistic regression accuracy: 0.85


### Support Vector Machine

In [21]:
clf_svm = svm.SVC(C = 1000)
clf_svm.fit(X_train, y_train)

y_pred = clf_svm.predict(X_test)
print('SVM accuracy:', calculate_accuracy(y_test, y_pred))

SVM accuracy: 1.0


### Random Forest Classifier

In [23]:
clf_RFC = RandomForestClassifier(n_estimators=10, random_state=42)
clf_RFC.fit(X_train, y_train)

y_pred = clf_RFC.predict(X_test)
print('Random Forest Classifier Accuracy:', calculate_accuracy(y_test, y_pred))

Random Forest Classifier Accuracy: 1.0
