In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score

## Reading and analysing my data

In [2]:
# Load the patient-level data set, report its (rows, columns) shape and
# display the first five rows as the cell result.
cancer = pd.read_csv(filepath_or_buffer="../dataset/cancer patient data sets.csv")
print(cancer.shape)
cancer.head(n=5)

(1000, 26)


Unnamed: 0,index,Patient Id,Age,Gender,Air Pollution,Alcohol use,Dust Allergy,OccuPational Hazards,Genetic Risk,chronic Lung Disease,...,Fatigue,Weight Loss,Shortness of Breath,Wheezing,Swallowing Difficulty,Clubbing of Finger Nails,Frequent Cold,Dry Cough,Snoring,Level
0,0,P1,33,1,2,4,5,4,3,2,...,3,4,2,2,3,1,2,3,4,Low
1,1,P10,17,1,3,1,5,3,4,2,...,1,3,7,8,6,2,1,7,2,Medium
2,2,P100,35,1,4,5,6,5,5,4,...,8,7,9,2,1,4,6,7,2,High
3,3,P1000,37,1,7,7,7,7,6,7,...,4,2,3,1,4,5,6,7,5,High
4,4,P101,46,1,6,8,7,7,7,6,...,3,2,4,1,4,2,4,2,3,High


In [3]:
# Load the data set that the classifier below is trained on and report its
# (rows, columns) shape.
cancer2 = pd.read_csv("../dataset/cancer.csv")
print(cancer2.shape)

# Peek at ten randomly chosen labels. The draw is seeded (same seed as the
# train/test split) so that Restart & Run All shows the same rows every time.
cancer2["Class"].sample(10, random_state=101)

(682, 10)


55     1
445    0
514    0
428    0
260    1
148    1
409    0
601    0
227    1
59     0
Name: Class, dtype: int64

In [4]:
# Summary statistics of the data set.
print(cancer2.describe())
# Number of rows for each class label.
print(cancer2.loc[:, "Class"].value_counts())
# Mean of every feature within each class (shown as the cell result).
cancer2.groupby(by="Class").mean()

            Class         age   menopause  tumor-size   inv-nodes   node-caps  \
count  682.000000  682.000000  682.000000  682.000000  682.000000  682.000000   
mean     0.348974    4.442815    3.143695    3.208211    2.826979    3.233138   
std      0.476995    2.822781    3.061753    2.985140    2.865457    2.224523   
min      0.000000    1.000000    1.000000    1.000000    1.000000    1.000000   
25%      0.000000    2.000000    1.000000    1.000000    1.000000    2.000000   
50%      0.000000    4.000000    1.000000    1.000000    1.000000    2.000000   
75%      1.000000    6.000000    5.000000    5.000000    4.000000    4.000000   
max      1.000000   10.000000   10.000000   10.000000   10.000000   10.000000   

        deg-malig      breast  breast-quad    irradiat  
count  682.000000  682.000000   682.000000  682.000000  
mean     3.542522    3.435484     2.868035    1.604106  
std      3.646104    2.438573     3.054599    1.733792  
min      1.000000    1.000000     1.000000

Unnamed: 0_level_0,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,2.963964,1.306306,1.414414,1.346847,2.108108,1.346847,2.083333,1.261261,1.065315
1,7.201681,6.571429,6.554622,5.588235,5.331933,7.638655,5.957983,5.865546,2.609244


In [5]:
# Count the missing values in each column.
print(cancer2.isna().sum())

Class          0
age            0
menopause      0
tumor-size     0
inv-nodes      0
node-caps      0
deg-malig      0
breast         0
breast-quad    0
irradiat       0
dtype: int64


## Separating labels

In [6]:
# Feature matrix: every column except the target.
X = cancer2.drop(columns=["Class"])
# Target vector: the "Class" column on its own.
y = cancer2["Class"]
# Show both shapes, one per line.
print(X.shape, y.shape, sep="\n")

(682, 9)
(682,)


## Train test split

In [7]:
# Hold out 30% of the rows for testing. The split is stratified on the label
# and uses a fixed seed so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=101,
)
print(X.shape, X_train.shape, X_test.shape)

(682, 9) (477, 9) (205, 9)


### Training model

In [8]:
# Support vector classifier with a linear kernel.
classifier = svm.SVC(kernel="linear")
# Fit on the training split; the fitted estimator is the cell result.
classifier.fit(X=X_train, y=y_train)

### model evaluation

### accuracy score

In [9]:
# Accuracy score on the training data, expressed as a percentage.
X_train_prediction = classifier.predict(X_train)
# accuracy_score is documented as (y_true, y_pred); pass the arguments by
# keyword so the ground truth and the predictions cannot be mixed up.
training_data_accuracy = accuracy_score(y_true=y_train, y_pred=X_train_prediction) * 100
print("Accuracy score of the training data : ", training_data_accuracy, "%")

Accuracy score of the training data :  97.69392033542978 %


In [10]:
# Accuracy score on the held-out test data, expressed as a percentage.
X_test_prediction = classifier.predict(X_test)
# accuracy_score is documented as (y_true, y_pred); pass the arguments by
# keyword so the ground truth and the predictions cannot be mixed up.
test_data_accuracy = accuracy_score(y_true=y_test, y_pred=X_test_prediction) * 100
print("Accuracy score of the test data : ", test_data_accuracy, "%")

Accuracy score of the test data :  96.09756097560975 %


### making a prediction

In [11]:
# Feature values for one instance, in the same order as X.columns.
input_data = (5, 1, 1, 1, 2, 1, 3, 1, 1)

# Wrap the single instance in a one-row DataFrame that carries the training
# column names. The classifier was fitted on a DataFrame, and predicting on a
# bare NumPy array makes scikit-learn warn that X does not have valid feature
# names (and skips its column-name consistency check).
input_frame = pd.DataFrame([input_data], columns=X.columns)

prediction = classifier.predict(input_frame)
print(prediction)

# The code treats label 1 as the positive (cancer) class.
if prediction[0] == 1:
    print("The person has cancer")
else:
    print("The person does not have cancer")

[0]
The person is does not have cancer




## Saving trained model

In [12]:
import pickle

In [13]:
# Persist the fitted classifier to disk with pickle.
filename = "../saved models/cancer_model.sav"
# The context manager guarantees the file is flushed and closed even if
# pickling fails, instead of leaving the handle to the garbage collector.
with open(filename, "wb") as model_file:
    pickle.dump(classifier, model_file)

In [14]:
# Loading the saved model.
# NOTE: pickle.load can execute arbitrary code, so only load model files that
# come from a trusted source (here: the file written by this notebook).
# The context manager closes the file handle as soon as loading finishes.
with open("../saved models/cancer_model.sav", "rb") as model_file:
    loaded_model = pickle.load(model_file)

In [15]:
# Feature values for one instance, in the same order as X.columns.
input_data = (5, 1, 2, 1, 2, 1, 1, 1, 1)

# Wrap the single instance in a one-row DataFrame that carries the training
# column names. The pickled classifier was fitted on a DataFrame, and
# predicting on a bare NumPy array makes scikit-learn warn that X does not
# have valid feature names (and skips its column-name consistency check).
input_frame = pd.DataFrame([input_data], columns=X.columns)

prediction = loaded_model.predict(input_frame)
print(prediction)

# The code treats label 1 as the positive (cancer) class.
if prediction[0] == 1:
    print("The person has cancer")
else:
    print("The person does not have breast cancer")

[0]
The person does not have breast cancer




In [16]:
# List the feature names, one per line, in column order.
for feature_name in X.columns.tolist():
    print(feature_name)

age
menopause
tumor-size
inv-nodes
node-caps
deg-malig
breast
breast-quad
irradiat
