<a href="https://colab.research.google.com/github/SonakshiA/Anemia_Detection/blob/main/Anemia_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install opendatasets



In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score

In [None]:
import opendatasets as od

In [None]:
# Kaggle page of the anemia dataset; this URL is what the download cell is given.
data = "https://www.kaggle.com/datasets/biswaranjanrao/anemia-dataset"

In [None]:
# Fetch the dataset at the URL held in `data`. The recorded run reports the files
# under ./anemia-dataset and skips the download when they are already present.
od.download(data)

Skipping, found downloaded files in "./anemia-dataset" (use force=True to force download)


In [None]:
# Load the CSV into a DataFrame.
# The download cell places its files under ./anemia-dataset, so look there first;
# fall back to a copy in the working directory (the path this notebook originally read).
try:
    anemia_dataset = pd.read_csv('anemia-dataset/anemia.csv')
except FileNotFoundError:
    anemia_dataset = pd.read_csv('anemia.csv')

In [None]:
# Preview the first rows to confirm the columns (Gender, Hemoglobin, MCH, MCHC, MCV, Result) parsed as expected.
anemia_dataset.head()

Unnamed: 0,Gender,Hemoglobin,MCH,MCHC,MCV,Result
0,1,14.9,22.7,29.1,83.7,0
1,0,15.9,25.4,28.3,72.0,0
2,0,9.0,21.5,29.6,71.2,1
3,0,14.9,16.0,31.4,87.5,0
4,1,14.7,22.0,28.2,99.5,0


In [None]:
# Size of the loaded frame; the recorded run shows 1421 rows and 6 columns.
anemia_dataset.shape # .shape is a tuple: (number of rows, number of columns)

(1421, 6)

In [None]:
# Summary statistics per column (count, mean, std, min, quartiles, max) to see each feature's range.
anemia_dataset.describe()

Unnamed: 0,Gender,Hemoglobin,MCH,MCHC,MCV,Result
count,1421.0,1421.0,1421.0,1421.0,1421.0,1421.0
mean,0.52076,13.412738,22.90563,30.251232,85.523786,0.436312
std,0.499745,1.974546,3.969375,1.400898,9.636701,0.496102
min,0.0,6.6,16.0,27.8,69.4,0.0
25%,0.0,11.7,19.4,29.0,77.3,0.0
50%,1.0,13.2,22.7,30.4,85.3,0.0
75%,1.0,15.0,26.2,31.4,94.2,1.0
max,1.0,16.9,30.0,32.5,101.6,1.0


In [None]:
# Class balance of the label column: how many rows are 0 and how many are 1.
anemia_dataset['Result'].value_counts()

0    801
1    620
Name: Result, dtype: int64

In [None]:
# Mean of every feature within each class; in the recorded run Hemoglobin shows the
# largest gap between the two classes (about 14.8 for Result=0 vs 11.6 for Result=1).
anemia_dataset.groupby('Result').mean()

Unnamed: 0_level_0,Gender,Hemoglobin,MCH,MCHC,MCV
Result,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.409488,14.795506,23.005743,30.19201,85.698127
1,0.664516,11.62629,22.77629,30.327742,85.298548


In [None]:
# Separate the inputs from the target:
#   X -> every column except 'Result' (the features)
#   Y -> the 'Result' column on its own (the label to predict)
X = anemia_dataset.drop(columns=['Result'])
Y = anemia_dataset.loc[:, 'Result']

In [None]:
# Show the feature matrix to confirm that only the 'Result' column was removed.
print(X)

      Gender  Hemoglobin   MCH  MCHC   MCV
0          1        14.9  22.7  29.1  83.7
1          0        15.9  25.4  28.3  72.0
2          0         9.0  21.5  29.6  71.2
3          0        14.9  16.0  31.4  87.5
4          1        14.7  22.0  28.2  99.5
...      ...         ...   ...   ...   ...
1416       0        10.6  25.4  28.2  82.9
1417       1        12.1  28.3  30.4  86.9
1418       1        13.1  17.7  28.1  80.7
1419       0        14.3  16.2  29.5  95.2
1420       0        11.8  21.2  28.4  98.1

[1421 rows x 5 columns]


In [None]:
# Show the label vector; it has one entry per row of the dataset.
print(Y)

0       0
1       0
2       1
3       0
4       0
       ..
1416    1
1417    1
1418    1
1419    0
1420    1
Name: Result, Length: 1421, dtype: int64


Data Standardization: rescale every feature to zero mean and unit variance so that features measured on different scales contribute comparably and the model can learn from them more easily.


In [None]:
# Scaler object that is fitted and applied in the following cells, and reused later
# to standardize the hand-entered sample before prediction.
scaler = StandardScaler()

In [None]:
# fit() computes the mean and standard deviation of every feature for later scaling.
# Fit on the raw feature columns taken straight from the DataFrame instead of on `X`:
# a later cell rebinds `X` to the standardized array, so re-running `scaler.fit(X)` after
# that point refits on already-scaled data and turns transform() into a no-op.
# NOTE(review): this still fits on every row, including rows later held out for testing;
# fitting on the training split only would avoid that leakage.
scaler.fit(anemia_dataset.drop(columns='Result'))

In [None]:
# transform() performs the scaling using the statistics learned by fit().
# Read the raw features from the DataFrame rather than from `X`, which a later cell
# rebinds to the standardized array; re-running this cell then stays idempotent
# instead of standardizing already-standardized values.
std_data = scaler.transform(anemia_dataset.drop(columns='Result'))

In [None]:
# Inspect the standardized features: a NumPy array with one row per sample.
print(std_data)

[[ 0.95930718  0.75348274 -0.05182232 -0.82207061 -0.18932081]
 [-1.04241896  1.26010665  0.62862494 -1.39333406 -1.40385665]
 [-1.04241896 -2.23559834 -0.35424332 -0.46503096 -1.48690183]
 ...
 [ 0.95930718 -0.1584403  -1.31190984 -1.53614992 -0.50074026]
 [-1.04241896  0.44950839 -1.68993609 -0.53643889  1.00445372]
 [-1.04241896 -0.81705139 -0.42984857 -1.32192613  1.30549252]]


In [None]:
# From here on `X` is the standardized NumPy array, no longer the raw DataFrame.
# Earlier cells that take `X` (for example the scaler fit) would receive scaled data
# if they were re-run after this cell, so run the notebook strictly top to bottom.
X = std_data
Y = anemia_dataset['Result'] # same label column as before

In [None]:
# Show the standardized features followed by the labels, one object per line.
print(X, Y, sep='\n')

[[ 0.95930718  0.75348274 -0.05182232 -0.82207061 -0.18932081]
 [-1.04241896  1.26010665  0.62862494 -1.39333406 -1.40385665]
 [-1.04241896 -2.23559834 -0.35424332 -0.46503096 -1.48690183]
 ...
 [ 0.95930718 -0.1584403  -1.31190984 -1.53614992 -0.50074026]
 [-1.04241896  0.44950839 -1.68993609 -0.53643889  1.00445372]
 [-1.04241896 -0.81705139 -0.42984857 -1.32192613  1.30549252]]
0       0
1       0
2       1
3       0
4       0
       ..
1416    1
1417    1
1418    1
1419    0
1420    1
Name: Result, Length: 1421, dtype: int64


Train-Test Split

In [None]:
# Hold out 20% of the rows for testing (test_size=0.2).
# stratify=Y keeps the proportion of 0s and 1s the same in the training and test sets;
# random_state=2 fixes the shuffle so the split is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)

In [None]:
# Shapes of the full feature matrix and of each split, printed on a single line.
print(*(part.shape for part in (X, X_train, X_test)))

(1421, 5) (1136, 5) (285, 5)


Training the Model

In [None]:
# Support-vector classifier; kernel='linear' restricts it to a linear decision boundary.
classifier = svm.SVC(kernel='linear') # SVC performs classification, not regression

In [None]:
# Train the classifier on the training split only; the test split stays unseen until evaluation.
classifier.fit(X_train,Y_train)

Model Evaluation

Accuracy Score

In [None]:
# Accuracy on the rows the model was trained on (an optimistic reference point).
X_train_prediction = classifier.predict(X_train)
# accuracy_score is documented as (y_true, y_pred): ground-truth labels first, predictions second.
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

In [None]:
# Fraction of training rows classified correctly (1.0 would be a perfect score).
print(training_data_accuracy)

0.9911971830985915


In [None]:
# Accuracy on the held-out rows, which the model did not see during training.
X_test_prediction = classifier.predict(X_test)
# accuracy_score is documented as (y_true, y_pred): ground-truth labels first, predictions second.
testing_data_accuracy = accuracy_score(Y_test, X_test_prediction)

In [None]:
# Fraction of held-out rows classified correctly; compare with the training accuracy to spot overfitting.
print(testing_data_accuracy)

0.9789473684210527


Predictive System

In [None]:
# Classify one hand-entered sample end to end: raw values -> standardize -> predict.
# The values must follow the training column order: Gender, Hemoglobin, MCH, MCHC, MCV.
input_data = (0,11.6,22.3,30.9,74.5)

input_data_as_np_array = np.asarray(input_data)

# reshape(1, -1): a single row, with the number of columns inferred from the data.
input_data_reshaped = input_data_as_np_array.reshape(1,-1)
print(input_data_reshaped)

# Reuse the scaler fitted on the dataset so the sample is on the scale the model was trained on.
stand_data = scaler.transform(input_data_reshaped)
print(stand_data)

# Sanity check: if standardization left the values unchanged, the scaler was refit on
# already-standardized data (e.g. the fit cell was re-run after `X` had been rebound to
# the scaled array) and the prediction below would be made on unscaled input.
assert not np.allclose(stand_data, input_data_reshaped), (
    "scaler.transform() returned the input unchanged; restart the kernel and run the "
    "notebook top to bottom so the scaler is fitted on the raw features"
)

prediction = classifier.predict(stand_data)
print(prediction)

# Label 1 means anemic, label 0 means not anemic.
if prediction[0] == 1:
  print("Person is anemic")
else:
  print("Person is not anemic")

[[ 0.  11.6 22.3 30.9 74.5]]
[[1.20007218e-16 1.16000000e+01 2.23000000e+01 3.09000000e+01
  7.45000000e+01]]
[0]
Person is not anemiac
