# 1. Analysing data

#### Import the necessary libraries

In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
from sklearn import preprocessing

In [2]:
# Load the anemia dataset from a relative path (resolved against the kernel's working directory).
dataset_path = '../dataset/anemia.csv'
data = pd.read_csv(dataset_path)

In [3]:
# Row count of the dataset. Only a cell's last expression is displayed, so this
# cell holds just the one expression whose value is shown.
len(data)

1421

In [4]:
# Preview the first five rows of the raw dataset.
data.head(n=5)

Unnamed: 0,Gender,Hemoglobin,MCH,MCHC,MCV,Result
0,1,14.9,22.7,29.1,83.7,0
1,0,15.9,25.4,28.3,72.0,0
2,0,9.0,21.5,29.6,71.2,1
3,0,14.9,16.0,31.4,87.5,0
4,1,14.7,22.0,28.2,99.5,0


In [5]:
# Print a structural summary of the frame: index range, column names,
# non-null counts, dtypes and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1421 entries, 0 to 1420
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Gender      1421 non-null   int64  
 1   Hemoglobin  1421 non-null   float64
 2   MCH         1421 non-null   float64
 3   MCHC        1421 non-null   float64
 4   MCV         1421 non-null   float64
 5   Result      1421 non-null   int64  
dtypes: float64(4), int64(2)
memory usage: 66.7 KB


In [6]:
# Count missing values per column (isna is the same check as isnull).
missing_mask = data.isna()
missing_mask.sum()

Gender        0
Hemoglobin    0
MCH           0
MCHC          0
MCV           0
Result        0
dtype: int64

### Gender dependencies

In [7]:
# Number of rows for every (Gender, Result) combination.
by_gender_and_result = data.groupby(['Gender', 'Result'])
by_gender_and_result['Result'].count()

Gender  Result
0       0         473
        1         208
1       0         328
        1         412
Name: Result, dtype: int64

### Continuous Feature: Hemoglobin

In [8]:

# Range and mean of the Hemoglobin column.
hemoglobin = data['Hemoglobin']
print('The highest hemoglobin was of:', hemoglobin.max())
print('The lowest hemoglobin was of:', hemoglobin.min())
print('The average hemoglobin in the data:', hemoglobin.mean())

The highest hemoglobin was of: 16.9
The lowest hemoglobin was of: 6.6
The average hemoglobin in the data: 13.412737508796623


### Continuous Feature: MCH

In [9]:

# Range and mean of the MCH column.
mch = data['MCH']
print('The highest MCH was of:', mch.max())
print('The lowest MCH was of:', mch.min())
print('The average MCH in the data:', mch.mean())

The highest MCH was of: 30.0
The lowest MCH was of: 16.0
The average MCH in the data: 22.90562983814215


<div class="alert alert-block alert-info"> 📌 MCH stands for “mean corpuscular hemoglobin.” An MCH value refers to the average quantity of hemoglobin present in a single red blood cell. Hemoglobin is the protein in your red blood cells that transports oxygen to the tissues of your body</div>

In [10]:
# Range and mean of the MCHC column.
mchc = data['MCHC']
print('The highest MCHC was of:', mchc.max())
print('The lowest MCHC was of:', mchc.min())
print('The average MCHC in the data:', mchc.mean())

The highest MCHC was of: 32.5
The lowest MCHC was of: 27.8
The average MCHC in the data: 30.251231527093594


<div class="alert alert-block alert-info"> 📌 A similar measure to MCH is something doctors call "mean corpuscular hemoglobin concentration" (MCHC). MCHC checks the average amount of hemoglobin in a group of red blood cells.</div>

### Continuous Feature: MCV (Mean Corpuscular Volume)

In [11]:
# Range and mean of the MCV column.
mcv = data['MCV']
print('The highest MCV was of:', mcv.max())
print('The lowest MCV was of:', mcv.min())
print('The average MCV in the data:', mcv.mean())

The highest MCV was of: 101.6
The lowest MCV was of: 69.4
The average MCV in the data: 85.5237860661506


<div class="alert alert-block alert-info"> 📌 Mean corpuscular volume (MCV) is a laboratory value that measures the average size and volume of a red blood cell. It is useful in helping determine the etiology of anemia. The value is calculated by multiplying the percent hematocrit by ten and dividing by the erythrocyte count.</div>

# 2. Data Cleaning

## Normalizing the features:

In [12]:
# Scale each continuous feature column to unit L2 norm (axis=0 normalizes per
# column, i.e. across all rows), keeping the original row index and column names.
# `preprocessing` and `pd` come from the notebook's first import cell.
# NOTE(review): scaled_df is not consumed by the modelling cells visible in this
# notebook, which fit on the raw `data` -- confirm whether that is intended.
feature_cols = ["Hemoglobin", "MCH", "MCHC", "MCV"]
normalized = preprocessing.normalize(data[feature_cols], axis=0)
scaled_df = pd.DataFrame(normalized, columns=feature_cols, index=data.index)
scaled_df

Unnamed: 0,Hemoglobin,MCH,MCHC,MCV
0,0.029155,0.025904,0.025491,0.025799
1,0.031112,0.028985,0.024790,0.022193
2,0.017611,0.024535,0.025929,0.021946
3,0.029155,0.018258,0.027506,0.026970
4,0.028764,0.025105,0.024703,0.030669
...,...,...,...,...
1416,0.020741,0.028985,0.024703,0.025552
1417,0.023677,0.032294,0.026630,0.026785
1418,0.025633,0.020198,0.024615,0.024874
1419,0.027981,0.018487,0.025841,0.029344


# 3. Predictive Modelling

In [13]:
from sklearn.linear_model import LogisticRegression 
from sklearn import svm 
from sklearn.ensemble import RandomForestClassifier 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier 
from sklearn.model_selection import train_test_split 
from sklearn import metrics 
from sklearn.metrics import confusion_matrix 

#### Splitting the data

In [14]:
# Stratified 80/20 split so both partitions keep the same Result class balance;
# random_state pins the shuffle so the split is reproducible.
train, test = train_test_split(
    data, test_size=0.2, random_state=0, stratify=data['Result']
)

# Features are every column except the last one (Result).
feature_columns = data.columns[:-1]
train_X = train[feature_columns]
test_X = test[feature_columns]

# Keep the targets as 1-D Series: scikit-learn estimators expect y of shape
# (n_samples,), and a single-column DataFrame triggers a DataConversionWarning.
train_Y = train['Result']
test_Y = test['Result']

# Full feature matrix and target over the whole dataset.
X = data[feature_columns]
Y = data['Result']

len(train_X), len(train_Y), len(test_X), len(test_Y)

(1136, 1136, 285, 285)

In [15]:
# train_Y  # this is the preserved extracted value
# Display the held-out feature matrix.
test_X

Unnamed: 0,Gender,Hemoglobin,MCH,MCHC,MCV
429,1,15.0,21.7,30.6,90.0
755,1,14.9,28.2,31.2,90.8
1078,0,16.2,25.1,29.9,81.6
207,1,11.8,24.3,32.0,80.6
845,1,11.2,27.8,32.2,95.8
...,...,...,...,...,...
458,1,15.4,23.4,32.4,92.9
77,1,13.7,18.7,30.3,69.4
1122,1,10.8,17.5,29.3,95.8
483,1,11.7,25.8,31.3,82.9


#### Logistic Regression

In [16]:
# Fit a logistic-regression classifier with default hyper-parameters.
LRmodel = LogisticRegression()
# ravel() hands the estimator a 1-D target whether train_Y is a Series or a
# single-column DataFrame.
LRmodel.fit(train_X, train_Y.values.ravel())
LogReg = LRmodel.predict(test_X)
# accuracy_score's signature is (y_true, y_pred): ground truth goes first.
print('The accuracy of the Logistic Regression is', metrics.accuracy_score(test_Y, LogReg))

The accuracy of the Logistic Regression is 0.9929824561403509


#### Random Forests

In [17]:
# Random forest of 100 trees. random_state fixes the bootstrap/feature sampling
# so the reported accuracy is reproducible across kernel restarts.
RFmodel = RandomForestClassifier(n_estimators=100, random_state=0)
# ravel() hands the estimator a 1-D target whether train_Y is a Series or a
# single-column DataFrame.
RFmodel.fit(train_X, train_Y.values.ravel())
RF = RFmodel.predict(test_X)
# accuracy_score's signature is (y_true, y_pred): ground truth goes first.
print('The accuracy of the Random Forests is', metrics.accuracy_score(test_Y, RF))

The accuracy of the Random Forests is 1.0


#### Radial Support Vector Machines(rbf-SVM)

In [18]:
# Support vector classifier with an RBF kernel.
model = svm.SVC(kernel='rbf', C=1, gamma=0.1)
# ravel() hands the estimator a 1-D target whether train_Y is a Series or a
# single-column DataFrame.
model.fit(train_X, train_Y.values.ravel())
RSVM = model.predict(test_X)
# accuracy_score's signature is (y_true, y_pred): ground truth goes first.
print('Accuracy for rbf SVM is ', metrics.accuracy_score(test_Y, RSVM))

Accuracy for rbf SVM is  0.9859649122807017


#### Linear Support Vector Machine(linear-SVM)

In [19]:
# Support vector classifier with a linear kernel. gamma is only a coefficient
# for the rbf/poly/sigmoid kernels, so it is not passed here.
model = svm.SVC(kernel='linear', C=0.1)
# ravel() hands the estimator a 1-D target whether train_Y is a Series or a
# single-column DataFrame.
model.fit(train_X, train_Y.values.ravel())
LSVM = model.predict(test_X)
# accuracy_score's signature is (y_true, y_pred): ground truth goes first.
print('Accuracy for linear SVM is', metrics.accuracy_score(test_Y, LSVM))

Accuracy for linear SVM is 0.9859649122807017


#### Decision Tree

In [20]:
# Decision tree with default settings. random_state makes tie-breaking between
# equally good splits deterministic, so the result is reproducible.
model = DecisionTreeClassifier(random_state=0)
# ravel() hands the estimator a 1-D target whether train_Y is a Series or a
# single-column DataFrame.
model.fit(train_X, train_Y.values.ravel())
DT = model.predict(test_X)
# accuracy_score's signature is (y_true, y_pred): ground truth goes first.
print('The accuracy of the Decision Tree is', metrics.accuracy_score(test_Y, DT))

The accuracy of the Decision Tree is 1.0


#### K-Nearest Neighbours(KNN)

In [21]:
# k-nearest-neighbours classifier with default settings, fitted on the
# unscaled feature columns.
model = KNeighborsClassifier()
# ravel() hands the estimator a 1-D target whether train_Y is a Series or a
# single-column DataFrame.
model.fit(train_X, train_Y.values.ravel())
Knnmodel = model.predict(test_X)
# accuracy_score's signature is (y_true, y_pred): ground truth goes first.
print('The accuracy of the KNN is', metrics.accuracy_score(test_Y, Knnmodel))

The accuracy of the KNN is 0.9017543859649123


## Predicting output on user-given input (trial)

In [22]:
# Re-display the first five rows as a reminder of the column order.
data.head(n=5)

Unnamed: 0,Gender,Hemoglobin,MCH,MCHC,MCV,Result
0,1,14.9,22.7,29.1,83.7,0
1,0,15.9,25.4,28.3,72.0,0
2,0,9.0,21.5,29.6,71.2,1
3,0,14.9,16.0,31.4,87.5,0
4,1,14.7,22.0,28.2,99.5,0


In [23]:
# First nine rows of the dataset.
cdf = data.head(9)
cdf

Unnamed: 0,Gender,Hemoglobin,MCH,MCHC,MCV,Result
0,1,14.9,22.7,29.1,83.7,0
1,0,15.9,25.4,28.3,72.0,0
2,0,9.0,21.5,29.6,71.2,1
3,0,14.9,16.0,31.4,87.5,0
4,1,14.7,22.0,28.2,99.5,0
5,0,11.6,22.3,30.9,74.5,1
6,1,12.7,19.5,28.9,82.9,1
7,1,12.7,28.5,28.2,92.3,1
8,0,14.1,29.7,30.5,75.2,0


In [24]:
# Rows whose MCH exceeds 23 and whose Result equals 1.
mch_above_23 = data['MCH'] > 23
result_is_one = data['Result'] == 1
fdf = data[mch_above_23 & result_is_one]
fdf

Unnamed: 0,Gender,Hemoglobin,MCH,MCHC,MCV,Result
7,1,12.7,28.5,28.2,92.3,1
22,0,6.9,28.1,32.5,94.6,1
24,1,11.0,26.0,32.2,98.9,1
25,1,11.0,25.2,30.9,83.2,1
34,0,11.9,29.5,32.2,87.8,1
...,...,...,...,...,...,...
1396,1,13.0,26.0,31.4,82.8,1
1404,1,11.6,24.6,28.2,96.7,1
1405,0,11.7,24.4,31.5,99.8,1
1416,0,10.6,25.4,28.2,82.9,1


In [25]:
# One hand-written sample, in the training column order:
# Gender, Hemoglobin, MCH, MCHC, MCV.
trial_row = [1, 9, 15, 33, 44]
# predict() expects a 2-D input, hence the single row is wrapped in a list.
trial = [trial_row]


In [26]:
# The forest was fitted on a DataFrame, so wrap the raw row with the training
# column names; a bare list makes scikit-learn warn about missing feature names.
RFmodel.predict(pd.DataFrame(trial, columns=train_X.columns))

array([1])

In [27]:
# The model was fitted on a DataFrame, so wrap the raw row with the training
# column names; a bare list makes scikit-learn warn about missing feature names.
LRmodel.predict(pd.DataFrame(trial, columns=train_X.columns))

array([1])

# saving the model

In [28]:
import pickle

In [29]:
# Serialize the fitted logistic-regression model to disk in binary mode;
# the context manager closes the file even if pickling fails.
with open('model_pklg', 'wb') as model_file:
    pickle.dump(LRmodel, model_file)