In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from sklearn.datasets import load_breast_cancer as lbc

# STEP #1: IMPORTING DATA

In [None]:
# Load the breast cancer dataset; the returned object is accessed like a dict below.
cancers = lbc()

In [None]:
# Display the whole dataset object (long output: it contains every array).
cancers

In [None]:
# List the fields available in the dataset object.
cancers.keys()

In [None]:
# Human-readable description of the dataset.
print(cancers['DESCR'])

In [None]:
# Class label of every sample.
print(cancers['target'])

In [None]:
# Shape of the label vector.
cancers['target'].shape

In [None]:
# Names of the classes that the labels refer to.
print(cancers['target_names'])

In [None]:
# Names of the feature columns.
print(cancers['feature_names'])

In [None]:
# Shape of the feature matrix.
cancers['data'].shape

In [None]:
 # Join the feature matrix and the label vector column-wise into a single
 # DataFrame; the appended last column is named 'target'.
 cancers_DF = pd.DataFrame(
     data=np.c_[cancers['data'], cancers['target']],
     columns=np.append(cancers['feature_names'], ['target']),
 )

In [None]:
# First rows of the combined feature/target frame.
cancers_DF.head()

In [None]:
# Summary statistics for every column.
cancers_DF.describe()

In [None]:
# Last rows of the combined feature/target frame.
cancers_DF.tail()

# STEP #2: Visualize DATA

In [None]:
# Pairwise relationships between the ten "mean" features, coloured by class.
sns.pairplot(
    data=cancers_DF,
    hue='target',
    vars=[
        'mean radius',
        'mean texture',
        'mean perimeter',
        'mean area',
        'mean smoothness',
        'mean compactness',
        'mean concavity',
        'mean concave points',
        'mean symmetry',
        'mean fractal dimension',
    ],
)

In [None]:
# Number of samples per class. The column is named by keyword because
# seaborn >= 0.12 interprets the first positional argument as `data`, not `x`
# (seaborn 0.11 only emits a FutureWarning for the positional form).
sns.countplot(x='target', data=cancers_DF)

In [None]:
# Mean area against mean smoothness, one colour per class.
sns.scatterplot(
    data=cancers_DF,
    x='mean area',
    y='mean smoothness',
    hue='target',
)

In [None]:
# Annotated heatmap of the correlations between all columns (features and
# target); the large figure size keeps the per-cell annotations readable.
plt.figure(figsize=(30, 20))
sns.heatmap(data=cancers_DF.corr(), annot=True)

# STEP #3: Model Training (Finding a problem solution)

#### splitting dataframe into features and target where x is feature and y is target

In [None]:
# Feature matrix: every column except the label.
x = cancers_DF.drop(columns=['target'])

In [None]:
# Preview the feature matrix (the 'target' column has been dropped).
x.head()

In [None]:
# Label vector: the 'target' column on its own.
y = cancers_DF.loc[:, 'target']

In [None]:
# Preview the first labels. `head` must be called: the bare attribute only
# shows the bound-method repr instead of the rows.
y.head()

### Create Test and train data

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
 # Hold out 20% of the samples for testing; the fixed random_state makes the
 # split reproducible.
 x_train, x_test, y_train, y_test = train_test_split(
     x,
     y,
     test_size=0.2,
     random_state=5,
 )

In [None]:
# First rows of the training features.
x_train.head()

In [None]:
# First rows of the test features.
x_test.head()

In [None]:
# First training labels.
y_train.head()

In [None]:
# First test labels.
y_test.head()

In [None]:
from sklearn.svm import SVC

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Baseline support vector classifier created with no arguments (library defaults).
svc_model1=SVC()

In [None]:
# Fit the baseline on the raw, unscaled training features.
svc_model1.fit(x_train,y_train)

# STEP #4: Evaluating Model

In [None]:
# Predict labels for the held-out test features with the baseline model.
y_predict=svc_model1.predict(x_test)

In [None]:
# Show the predicted labels.
y_predict

> Since the prediction is not perfect (the model predicts every sample as class 1), we have to improve the model. We can use a confusion matrix to inspect the false positives and false negatives.


![Confusion Matrix](./img/ConfusionMatrix.png)

**A confusion matrix is a summary of prediction results on a classification problem.
The number of correct and incorrect predictions are summarized with count values and broken down by each class. This is the key to the confusion matrix.
The confusion matrix shows the ways in which your classification model is confused when it makes predictions.
It gives us insight not only into the errors being made by a classifier but more importantly the types of errors that are being made.**

In the matrix below, every misclassified sample is either a false positive (the observation is negative but is predicted positive) or a false negative (the observation is positive but is predicted negative).

In [None]:
# Confusion matrix of the baseline model on the test split.
cm = confusion_matrix(y_true=y_test, y_pred=y_predict)

In [None]:
# Annotated heatmap of the baseline confusion matrix. fmt='d' prints the
# integer counts as-is; the default '.2g' format would show counts of 100 or
# more in scientific notation.
sns.heatmap(cm, annot=True, fmt='d')

```For the above problem, one solution is data normalization```

One step to perform is unity-based normalization, where we rescale all data to lie between 0 and 1: $x' = \dfrac{x - x_{min}}{x_{max} - x_{min}}$
<img src="./img/normal.JPG" alt="Markdown Monster icon" height="600" width="600" style="float: left; margin-right: 10px;" />

The next optimization is SVM parameter optimization:
The two key parameters that we can optimize during this process are the 'C' parameter and the γ (gamma) parameter.
#### C parameter: 
Controls the trade-off between classifying training points correctly and having a smooth decision boundary.
- **Small C (loose)** makes the cost (penalty) of misclassification low (soft margin).
- **Large C (strict)** makes the cost of misclassification high (hard margin), forcing the model to explain the input data more strictly and potentially overfit.
<img src="./img/SVMParamoptimization.JPG" alt="Markdown Monster icon" height="600" width="600" style="float: center; margin-right: 10px;" />






#### γ parameter: 
Controls how far the influence of a single training example reaches.
- **Large γ**: close reach (closer data points have a high weight), hence prone to overfitting.
- **Small γ**: far reach (a more generalized solution).


<img src="./img/large-y.JPG" alt="Markdown Monster icon" height="400" width="400" style="float: left; margin-right: 10px; " />
<img src="./img/small-y.JPG" alt="Markdown Monster icon" height="400" width="400" style="float: left; margin-right: 10px;" />

***Technique to optimize c and γ is grid search***

# STEP #5: Improve Model
### The first improvement is normalization, which simply scales the data to the range 0 to 1; this is called feature scaling or unity-based normalization

In [None]:
# Per-feature minimum of the training split.
min_train = x_train.min(axis=0)

In [None]:
# Per-feature range (max - min) of the training split.
range_train = x_train.max(axis=0) - min_train

In [None]:
# Min-max scale the training features: subtract the per-feature minimum and
# divide by the per-feature range, so every training column spans 0 to 1.
x_train_scaled= (x_train - min_train)/range_train

In [None]:
# Preview the scaled training features.
x_train_scaled.head()

In [None]:
# Raw training data: mean area vs. mean smoothness, coloured by label.
print("Without Scalling")
sns.scatterplot(
    x=x_train['mean area'],
    y=x_train['mean smoothness'],
    hue=y_train,
)

In [None]:
# Scaled training data: the same two features after min-max scaling.
print("With Scalling")
sns.scatterplot(
    x=x_train_scaled['mean area'],
    y=x_train_scaled['mean smoothness'],
    hue=y_train,
)

In [None]:
# Per-feature minimum of the test split.
# NOTE(review): these are statistics of the test data itself, not of the
# training data the model is fitted on.
min_test=x_test.min()

In [None]:
# Per-feature range (max - min) of the test split.
range_test=(x_test - min_test).max()

In [None]:
# Scale the test split with the minimum and range computed on the TRAINING
# split. Scaling it with its own min/range would apply a different transform
# from the one the model is fitted on and would use test-set statistics during
# evaluation. Test values may therefore fall slightly outside [0, 1].
x_test_scaled=(x_test - min_train)/range_train

In [None]:
# Preview the scaled test features.
x_test_scaled.head()

In [None]:
# Raw test data: mean area vs. mean smoothness, coloured by label.
print("Without Scalling")
sns.scatterplot(
    x=x_test['mean area'],
    y=x_test['mean smoothness'],
    hue=y_test,
)

In [None]:
# Scaled test data: the same two features after scaling.
print("With Scalling")
sns.scatterplot(
    x=x_test_scaled['mean area'],
    y=x_test_scaled['mean smoothness'],
    hue=y_test,
)

In [None]:
# Second classifier, again created with no arguments, for the scaled data.
svc_model2=SVC()

In [None]:
# Fit on the scaled training features.
svc_model2.fit(x_train_scaled,y_train)

In [None]:
# Predict labels for the scaled test features.
y_predict2=svc_model2.predict(x_test_scaled)

In [None]:
# Confusion matrix of the scaled-data model on the test split.
cm2 = confusion_matrix(y_true=y_test, y_pred=y_predict2)

In [None]:
# Annotated heatmap of the scaled-data confusion matrix. fmt='d' prints the
# integer counts as-is; the default '.2g' format would show counts of 100 or
# more in scientific notation.
sns.heatmap(cm2, annot=True, fmt='d')

**Summary of classification report**

In [None]:
# Text summary of the classification metrics for the scaled-data model.
print(classification_report(y_test,y_predict2))

### The next improvement is to tune the SVC (Support Vector Classifier), i.e. to tune the C and γ parameters

```To find the best values for C and gamma, sklearn provides a grid search method that tries every combination in the grid```

In [None]:
# Candidate hyper-parameters for the grid search: every C/gamma combination,
# always with the RBF kernel.
paramGrid = {
    'C': [0.1, 1, 10, 100],
    'gamma': [1, 0.1, 0.01, 0.001],
    'kernel': ['rbf'],
}

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Grid search over paramGrid; refit=True retrains the best
# combination on the full training data, verbose=4 logs each fit.
grid = GridSearchCV(
    estimator=SVC(),
    param_grid=paramGrid,
    refit=True,
    verbose=4,
)

In [None]:
# Run the grid search on the scaled training data.
grid.fit(x_train_scaled,y_train)

In [None]:
# Best parameter combination found by the search.
grid.best_params_

In [None]:
# Predict the scaled test features with the refitted best estimator.
grid_predict= grid.predict(x_test_scaled)

In [None]:
# Confusion matrix of the tuned model on the test split.
grid_cm = confusion_matrix(y_true=y_test, y_pred=grid_predict)

In [None]:
# Annotated heatmap of the tuned model's confusion matrix. fmt='d' prints the
# integer counts as-is; the default '.2g' format would show counts of 100 or
# more in scientific notation.
sns.heatmap(grid_cm, annot=True, fmt='d')

In [None]:
# Text summary of the classification metrics for the tuned model.
print(classification_report(y_test,grid_predict))

**Here we still have a few misclassified points, which are type errors (Type I / Type II); apart from these the result is near perfect.**