# Model and Library imports
### The required libraries (numpy and pandas) and the svm model (from sklearn) are imported as follows :

In [37]:
import numpy as np
import pandas as pd
from sklearn import preprocessing, svm
from sklearn.model_selection import train_test_split


# Data extraction
### The dataset is read using pandas.

In [38]:
df = pd.read_csv('bank_notes.csv')
df.head()

Unnamed: 0,variance,skewness,curtosis,entropy,Target
0,3.6216,8.6661,-2.8073,-0.44699,0
1,4.5459,8.1674,-2.4586,-1.4621,0
2,3.866,-2.6383,1.9242,0.10645,0
3,3.4566,9.5228,-4.0112,-3.5944,0
4,0.32924,-4.4552,4.5718,-0.9888,0


### Columns such as variance and skewness (of the bank note) are attributes used in digital image processing. The final column 'Target'  is what we want to predict using our model. Target can only have two values: 0 (non-counterfeit) or 1 (counterfeit). 


# Preprocessing Data

### The raw dataset is incomplete as it contains null values which have to be removed before passing it through the model. The following code replaces the '?' with a very large negative value to preserve the accuracy of our model.

In [39]:
df.replace('?',-9999,inplace=True)
df


Unnamed: 0,variance,skewness,curtosis,entropy,Target
0,3.6216,8.6661,-2.8073,-0.44699,0
1,4.5459,8.1674,-2.4586,-1.4621,0
2,3.866,-2.6383,1.9242,0.10645,0
3,3.4566,9.5228,-4.0112,-3.5944,0
4,0.32924,-4.4552,4.5718,-0.9888,0
...,...,...,...,...,...
1367,0.40614,1.3492,-1.4501,-0.55949,1
1368,-1.3887,-4.8773,6.4774,0.34179,1
1369,-3.7503,-13.4586,17.5932,-2.7771,1
1370,-3.5637,-8.3827,12.393,-1.2823,1


### We want to predict the values for 'Target' which becomes the dependent variable (y). The other attributes become the independent variable (x).

In [49]:
X = np.array(df.drop(['Target'],1))
y = np.array(df['Target'])

### Around 70 - 80 % of the data is used to train the model and the remaining data is used to test the accuracy of our model. The data is split using the train_test_split method from sklearn.

In [50]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)


### StandardScaler is used to transform the data so that its distribution will have a mean value 0 and standard deviation of 1.

In [51]:
scaler = preprocessing.StandardScaler().fit(X_train)
X_train_transformed = scaler.transform(X_train)
X_train_transformed

array([[0.01815715, 0.04590906, 0.02822632, 0.01877391],
       [0.04103271, 0.01045076, 0.04819029, 0.03775044],
       [0.03298887, 0.02551519, 0.02830334, 0.0373535 ],
       ...,
       [0.03392705, 0.03483365, 0.04103555, 0.03766623],
       [0.03126439, 0.01406137, 0.03330149, 0.03881293],
       [0.020844  , 0.00403753, 0.0632009 , 0.03419813]])

In [52]:
X_test_transformed = scaler.transform(X_test)

# Training the Data

### We define our SVM model and then train it. In this model the kernel is set to rbf. The gamma value used is 20 and the c parameter is set to 15.

In [53]:
clf = svm.SVC(C=15,gamma=20,kernel='rbf')

In [54]:
clf.fit(X_train_transformed, y_train)

SVC(C=15, gamma=20)

### We have defined another SVM model as well which uses the linear kernel.

In [55]:
clf2 = svm.SVC(C=15,gamma=20,kernel='linear')
clf2.fit(X_train_transformed, y_train)

SVC(C=15, gamma=20, kernel='linear')

# Evaluation and accuracy

### Using the testing data we have defined above we score the accuracy of our models and store it as confidence.

In [56]:
confidence = clf.score(X_test_transformed, y_test)
confidence

0.9781553398058253

In [57]:
confidence2 = clf2.score(X_test_transformed, y_test)
confidence2

0.9393203883495146

### Both models give an accuracy of above 95% which means that our model is pretty good.