In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [2]:
data = pd.read_csv("cell_samples.csv")  # importing the data

In [3]:
data.head()

Unnamed: 0,ID,Clump,UnifSize,UnifShape,MargAdh,SingEpiSize,BareNuc,BlandChrom,NormNucl,Mit,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [4]:
data.drop('ID', axis = 1, inplace = True)

In [5]:
data

Unnamed: 0,Clump,UnifSize,UnifShape,MargAdh,SingEpiSize,BareNuc,BlandChrom,NormNucl,Mit,Class
0,5,1,1,1,2,1,3,1,1,2
1,5,4,4,5,7,10,3,2,1,2
2,3,1,1,1,2,2,3,1,1,2
3,6,8,8,1,3,4,3,7,1,2
4,4,1,1,3,2,1,3,1,1,2
...,...,...,...,...,...,...,...,...,...,...
694,3,1,1,1,3,2,1,1,1,2
695,2,1,1,1,2,1,1,1,1,2
696,5,10,10,3,7,3,8,10,2,4
697,4,8,6,4,3,4,10,6,1,4


In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Clump        699 non-null    int64 
 1   UnifSize     699 non-null    int64 
 2   UnifShape    699 non-null    int64 
 3   MargAdh      699 non-null    int64 
 4   SingEpiSize  699 non-null    int64 
 5   BareNuc      699 non-null    object
 6   BlandChrom   699 non-null    int64 
 7   NormNucl     699 non-null    int64 
 8   Mit          699 non-null    int64 
 9   Class        699 non-null    int64 
dtypes: int64(9), object(1)
memory usage: 54.7+ KB


### Converting the 'BareNuc' into 'int' datatype

In [7]:
# data['BareNuc'] = data['BareNuc'].astype('int64')  # error will be shown because of "?" sign present in column 'BareNuc'

In [8]:
# to check error we are using for loop

In [9]:
for i in data['BareNuc']:
    try:
        if int(i):
            continue
    except:
        print(i)

?
?
?
?
?
?
?
?
?
?
?
?
?
?
?
?


In [10]:
data['BareNuc'] = data['BareNuc'].replace("?",0)  # replcing "?" with "0"

In [11]:
for i in data['BareNuc']:
    try:
        if int(i):
            continue
    except:
        print(i)

In [12]:
data['BareNuc'] = data['BareNuc'].astype('int64')  # converting "BareNuc" into 'integer' datatype

In [13]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   Clump        699 non-null    int64
 1   UnifSize     699 non-null    int64
 2   UnifShape    699 non-null    int64
 3   MargAdh      699 non-null    int64
 4   SingEpiSize  699 non-null    int64
 5   BareNuc      699 non-null    int64
 6   BlandChrom   699 non-null    int64
 7   NormNucl     699 non-null    int64
 8   Mit          699 non-null    int64
 9   Class        699 non-null    int64
dtypes: int64(10)
memory usage: 54.7 KB


### replacing "0" with 'mean' 

In [14]:
mean = data['BareNuc'].mean()
print(mean)
data['BareNuc'] = data['BareNuc'].replace(0,mean)  

3.463519313304721


In [15]:
data

Unnamed: 0,Clump,UnifSize,UnifShape,MargAdh,SingEpiSize,BareNuc,BlandChrom,NormNucl,Mit,Class
0,5,1,1,1,2,1.0,3,1,1,2
1,5,4,4,5,7,10.0,3,2,1,2
2,3,1,1,1,2,2.0,3,1,1,2
3,6,8,8,1,3,4.0,3,7,1,2
4,4,1,1,3,2,1.0,3,1,1,2
...,...,...,...,...,...,...,...,...,...,...
694,3,1,1,1,3,2.0,1,1,1,2
695,2,1,1,1,2,1.0,1,1,1,2
696,5,10,10,3,7,3.0,8,10,2,4
697,4,8,6,4,3,4.0,10,6,1,4


In [16]:
data.columns

Index(['Clump', 'UnifSize', 'UnifShape', 'MargAdh', 'SingEpiSize', 'BareNuc',
       'BlandChrom', 'NormNucl', 'Mit', 'Class'],
      dtype='object')

### Training the SVC model 

In [17]:
X = data[['Clump', 'UnifSize', 'UnifShape', 'MargAdh', 'SingEpiSize', 'BareNuc',
       'BlandChrom', 'NormNucl', 'Mit']]
y = data['Class']

x_train,x_test,y_train,y_test = train_test_split(X,y, test_size=0.2)
print(f"x_train shape - {x_train.shape}")
print(f"x_test shape - {x_test.shape}")
svc = SVC()
svc.fit(x_train,y_train)
y_pred = svc.predict(x_test)
print(f"Accuracy of SVC model is - {accuracy_score(y_pred,y_test):.2f}")

x_train shape - (559, 9)
x_test shape - (140, 9)
Accuracy of SVC model is - 0.96


In [18]:
# kernels - {'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'}

### Checking accuracy of model by using differen Kernels 

In [19]:
svc = SVC(kernel = "linear")
svc.fit(x_train,y_train)
y_pred = svc.predict(x_test)
print(f"Accuracy of SVC model is - {accuracy_score(y_pred,y_test):.2f}")

Accuracy of SVC model is - 0.95


In [20]:
svc = SVC(kernel = "sigmoid")
svc.fit(x_train,y_train)
y_pred = svc.predict(x_test)
print(f"Accuracy of SVC model is - {accuracy_score(y_pred,y_test):.2f}")

Accuracy of SVC model is - 0.43


In [21]:
svc = SVC(kernel = "rbf")
svc.fit(x_train,y_train)
y_pred = svc.predict(x_test)
print(f"Accuracy of SVC model is - {accuracy_score(y_pred,y_test):.2f}")

Accuracy of SVC model is - 0.96


In [22]:
svc = SVC(kernel = "poly")
svc.fit(x_train,y_train)
y_pred = svc.predict(x_test)
print(f"Accuracy of SVC model is - {accuracy_score(y_pred,y_test):.2f}")

Accuracy of SVC model is - 0.95


In [23]:
svc = SVC(kernel = "rbf")  # most commonly used kernel
svc.fit(x_train,y_train)
y_pred = svc.predict(x_test)
print(f"Accuracy of SVC model is - {accuracy_score(y_pred,y_test):.2f}")

Accuracy of SVC model is - 0.96


In [24]:
data

Unnamed: 0,Clump,UnifSize,UnifShape,MargAdh,SingEpiSize,BareNuc,BlandChrom,NormNucl,Mit,Class
0,5,1,1,1,2,1.0,3,1,1,2
1,5,4,4,5,7,10.0,3,2,1,2
2,3,1,1,1,2,2.0,3,1,1,2
3,6,8,8,1,3,4.0,3,7,1,2
4,4,1,1,3,2,1.0,3,1,1,2
...,...,...,...,...,...,...,...,...,...,...
694,3,1,1,1,3,2.0,1,1,1,2
695,2,1,1,1,2,1.0,1,1,1,2
696,5,10,10,3,7,3.0,8,10,2,4
697,4,8,6,4,3,4.0,10,6,1,4


In [25]:
predict = svc.predict([[5,4,8,3,7,4,3,1,1]])

# 2 => benign (non-cancerous)
# 4 => malignant (cancerous)

if predict[0] == 2:
    print("benign (non-cancerous)")
else:
    print("malignant (cancerous)")

malignant (cancerous)


