<a href="https://colab.research.google.com/github/ThemindaSrimal/machine-learning-project/blob/main/BreastCancerPrediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Importing dependencies

In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score

Data collection and analysis


In [4]:
#loading the Breast Cancer dataset to a pandas DataFrame
BreastCancer_dataset = pd.read_csv('/content/dataR2.csv') 

In [7]:
#printing some values to get an overview
BreastCancer_dataset.head()

Unnamed: 0,Age,BMI,Glucose,Insulin,HOMA,Leptin,Adiponectin,Resistin,MCP.1,Classification
0,48,23.5,70,2.707,0.467409,8.8071,9.7024,7.99585,417.114,1
1,83,20.690495,92,3.115,0.706897,8.8438,5.429285,4.06405,468.786,1
2,82,23.12467,91,4.498,1.009651,17.9393,22.43204,9.27715,554.697,1
3,68,21.367521,77,3.226,0.612725,9.8827,7.16956,12.766,928.22,1
4,86,21.111111,92,3.549,0.805386,6.6994,4.81924,10.57635,773.92,1


Labels

    Healthy : 1
    Patients : 2

Quantitative Attributes:

    Age (years)
    BMI (kg/m2)
    Glucose (mg/dL)
    Insulin (µU/mL)
    HOMA
    Leptin (ng/mL)
    Adiponectin (µg/mL)
    Resistin (ng/mL)
    MCP-1(pg/dL)



In [8]:
#number of rows and Columns in this dataset
BreastCancer_dataset.shape

(116, 10)

In [9]:
 #getting the statistical measures of the data
BreastCancer_dataset.describe()

Unnamed: 0,Age,BMI,Glucose,Insulin,HOMA,Leptin,Adiponectin,Resistin,MCP.1,Classification
count,116.0,116.0,116.0,116.0,116.0,116.0,116.0,116.0,116.0,116.0
mean,57.301724,27.582111,97.793103,10.012086,2.694988,26.61508,10.180874,14.725966,534.647,1.551724
std,16.112766,5.020136,22.525162,10.067768,3.642043,19.183294,6.843341,12.390646,345.912663,0.499475
min,24.0,18.37,60.0,2.432,0.467409,4.311,1.65602,3.21,45.843,1.0
25%,45.0,22.973205,85.75,4.35925,0.917966,12.313675,5.474282,6.881763,269.97825,1.0
50%,56.0,27.662416,92.0,5.9245,1.380939,20.271,8.352692,10.82774,471.3225,2.0
75%,71.0,31.241442,102.0,11.18925,2.857787,37.3783,11.81597,17.755207,700.085,2.0
max,89.0,38.578759,201.0,58.46,25.050342,90.28,38.04,82.1,1698.44,2.0


In [10]:
#number of healthy controls and patients (1 healthy, 2 patients)
BreastCancer_dataset['Classification'].value_counts()

2    64
1    52
Name: Classification, dtype: int64

In [11]:
#Analyse healthy vs patients by mean value
BreastCancer_dataset.groupby('Classification').mean()

Unnamed: 0_level_0,Age,BMI,Glucose,Insulin,HOMA,Leptin,Adiponectin,Resistin,MCP.1
Classification,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,58.076923,28.317336,88.230769,6.933769,1.552398,26.637933,10.328205,11.614813,499.730692
2,56.671875,26.98474,105.5625,12.513219,3.623342,26.596512,10.061167,17.253777,563.0165


In [12]:
#separating the data and labels
X = BreastCancer_dataset.drop(columns = 'Classification', axis=1)  # axis = 1 (specify column)
Y = BreastCancer_dataset['Classification']

In [13]:
print(X)  # print ds to make sure

     Age        BMI  Glucose  Insulin  ...   Leptin  Adiponectin  Resistin    MCP.1
0     48  23.500000       70    2.707  ...   8.8071     9.702400   7.99585  417.114
1     83  20.690495       92    3.115  ...   8.8438     5.429285   4.06405  468.786
2     82  23.124670       91    4.498  ...  17.9393    22.432040   9.27715  554.697
3     68  21.367521       77    3.226  ...   9.8827     7.169560  12.76600  928.220
4     86  21.111111       92    3.549  ...   6.6994     4.819240  10.57635  773.920
..   ...        ...      ...      ...  ...      ...          ...       ...      ...
111   45  26.850000       92    3.330  ...  54.6800    12.100000  10.96000  268.230
112   62  26.840000      100    4.530  ...  12.4500    21.420000   7.32000  330.160
113   65  32.050000       97    5.730  ...  61.4800    22.540000  10.33000  314.050
114   72  25.590000       82    2.820  ...  24.9600    33.750000   3.27000  392.460
115   86  27.180000      138   19.910  ...  90.2800    14.110000   4.35000  

In [14]:
print(Y)

0      1
1      1
2      1
3      1
4      1
      ..
111    2
112    2
113    2
114    2
115    2
Name: Classification, Length: 116, dtype: int64


Data Standardization

In [15]:
#get standard scalar  z = (x - u) / s  ; u = mean , s = standard deviation (range - to + value depend on data)
scaler = StandardScaler()   #Standardize features by removing the mean and scaling to unit variance

In [16]:
scaler.fit(X)  # Compute the mean and std to be used for later scaling.

StandardScaler(copy=True, with_mean=True, with_std=True)

In [17]:
standardized_data = scaler.transform(X)   # Perform standardization by centering and scaling

In [18]:
print(standardized_data)  # see standardized data

[[-0.57979363 -0.81667527 -1.23922225 ... -0.07022151 -0.54551749
  -0.34125061]
 [ 1.60182096 -1.37875056 -0.25829943 ... -0.69734988 -0.86421418
  -0.1912238 ]
 [ 1.53948912 -0.89176446 -0.30288683 ...  1.79799836 -0.4416602
   0.05821407]
 ...
 [ 0.47984774  0.89385486 -0.03536242 ...  1.81384272 -0.3563202
  -0.64049127]
 [ 0.91617066 -0.39854568 -0.70417344 ...  3.45903808 -0.92857684
  -0.41283214]
 [ 1.7888165  -0.0804471   1.79272102 ...  0.57664406 -0.84103616
  -1.29074683]]


In [19]:
#take standardize data for further processing 
X = standardized_data                 
Y = BreastCancer_dataset['Classification']

In [20]:
print(X)
print(Y)

[[-0.57979363 -0.81667527 -1.23922225 ... -0.07022151 -0.54551749
  -0.34125061]
 [ 1.60182096 -1.37875056 -0.25829943 ... -0.69734988 -0.86421418
  -0.1912238 ]
 [ 1.53948912 -0.89176446 -0.30288683 ...  1.79799836 -0.4416602
   0.05821407]
 ...
 [ 0.47984774  0.89385486 -0.03536242 ...  1.81384272 -0.3563202
  -0.64049127]
 [ 0.91617066 -0.39854568 -0.70417344 ...  3.45903808 -0.92857684
  -0.41283214]
 [ 1.7888165  -0.0804471   1.79272102 ...  0.57664406 -0.84103616
  -1.29074683]]
0      1
1      1
2      1
3      1
4      1
      ..
111    2
112    2
113    2
114    2
115    2
Name: Classification, Length: 116, dtype: int64


Train Test Split


In [22]:
#split ( make sure we split them roughly with equal patients and healthy into both sides )
#Test set size 20%
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.2, stratify=Y, random_state=2)

In [23]:
print(X.shape, X_train.shape, X_test.shape)

(116, 9) (92, 9) (24, 9)


Train the model

In [24]:
#source vector 

classifier = svm.SVC(kernel='linear')

In [25]:
#training the support vector Machine Classifier
classifier.fit(X_train, Y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

Accuracy Score

In [27]:
#accuracy score on the training data
X_train_prediction = classifier.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [28]:
print('Accuracy score of the training data : ', training_data_accuracy)

Accuracy score of the training data :  0.8043478260869565


In [29]:
#accuracy score on the test data
X_test_prediction = classifier.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

In [30]:
print('Accuracy score of the test data : ', test_data_accuracy)

Accuracy score of the test data :  0.7083333333333334


Predictive System

In [34]:
input_data = (45,20.83,74,4.56,0.832,7.76,8.24,28,384)

# changing the input_data to numpy array
input_data_as_numpy_array = np.asarray(input_data)

# reshape the array as we are predicting for one instance
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

# standardize the input data
std_data = scaler.transform(input_data_reshaped)
print(std_data)

prediction = classifier.predict(std_data)
print(prediction)

if (prediction[0] == 1):
  print('The person likely to be healthy')
else:
  print('The person likely to have Brest Cancer')

[[-0.76678917 -1.35084081 -1.06087264 -0.54388816 -0.51374184 -0.98715483
  -0.28484539  1.07594251 -0.43739529]]
[2]
The person likely to have Brest Cancer
