#Predict Blood Donations

Task will be to predict if a blood donor will donate within a giventime window.

### Task 1 : Print out the first 5 lines from datasets using the head shell command

In [1]:
!head -n 5 transfusion.data


'head' is not recognized as an internal or external command,
operable program or batch file.


### Task 2 : Load the dataset

We now know that we are working with a typical CSV file (i.e., the delimiter is ,, etc.). We proceed to loading the data into memory.


In [2]:
#importing pandas library 

import pandas as pd

In [3]:
#loading transfusion.data dataset to variable transfusion using pandas 

transfusion = pd.read_csv("transfusion.data")

In [4]:
#displaying top 5 rows of transfusion dataset

transfusion.head()

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months),whether he/she donated blood in March 2007
0,2,50,12500,98,1
1,0,13,3250,28,1
2,1,16,4000,35,1
3,2,20,5000,45,1
4,1,24,6000,77,0


In [5]:
#displaying last 5 rows of transfusion dataset

transfusion.tail()

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months),whether he/she donated blood in March 2007
743,23,2,500,38,0
744,21,2,500,52,0
745,23,3,750,62,0
746,39,1,250,39,0
747,72,1,250,72,0


In [6]:
#to display number of rows and columns

transfusion.shape

(748, 5)

In [7]:
#checking for null values in dataset

transfusion.isnull().sum()

Recency (months)                              0
Frequency (times)                             0
Monetary (c.c. blood)                         0
Time (months)                                 0
whether he/she donated blood in March 2007    0
dtype: int64

### Task 3 : print summary of dataset

In [8]:
transfusion.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 748 entries, 0 to 747
Data columns (total 5 columns):
Recency (months)                              748 non-null int64
Frequency (times)                             748 non-null int64
Monetary (c.c. blood)                         748 non-null int64
Time (months)                                 748 non-null int64
whether he/she donated blood in March 2007    748 non-null int64
dtypes: int64(5)
memory usage: 29.3 KB


In [9]:
transfusion.describe()

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months),whether he/she donated blood in March 2007
count,748.0,748.0,748.0,748.0,748.0
mean,9.506684,5.514706,1378.676471,34.282086,0.237968
std,8.095396,5.839307,1459.826781,24.376714,0.426124
min,0.0,1.0,250.0,2.0,0.0
25%,2.75,2.0,500.0,16.0,0.0
50%,7.0,4.0,1000.0,28.0,0.0
75%,14.0,7.0,1750.0,50.0,0.0
max,74.0,50.0,12500.0,98.0,1.0


### Task 4 : Renaming column  whether he/she donated blood in March 2007 to target 

In [10]:
transfusion.rename(columns = {'whether he/she donated blood in March 2007' : 'Target'}, inplace = True)

In [11]:
transfusion.head(2)

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months),Target
0,2,50,12500,98,1
1,0,13,3250,28,1


### Task 5 : Print target incidence

In [12]:
transfusion['Target'].value_counts(normalize=True)

0    0.762032
1    0.237968
Name: Target, dtype: float64

We understand that the data is imblanced. 

### Task 6 : Spliting datasets

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
x_train, x_test, y_train, y_test = train_test_split(
    transfusion.drop(columns='Target'),
    transfusion.Target,
    test_size=0.25,
    random_state=42,
    stratify=transfusion.Target
)

In [15]:
x_train.head(2)

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months)
334,16,2,500,16
99,5,7,1750,26


In [16]:
print("Number of rows and columns in x_train is : ", x_train.shape)
print("Number of rows and columns in x_test is : ", x_test.shape)
print("Number of rows and columns in y_train is : ", y_train.shape)
print("Number of rows and columns in y_test is : ", y_test.shape)

Number of rows and columns in x_train is :  (561, 4)
Number of rows and columns in x_test is :  (187, 4)
Number of rows and columns in y_train is :  (561,)
Number of rows and columns in y_test is :  (187,)


### Task 7 : Selecting model using TPOT

TPOT uses a tree-based structure to represent a model pipeline for a predictive modeling problem, including data preparation and modeling algorithms and model hyperparameters.

In [17]:
# Import TPOTClassifier and roc_auc_score
from tpot import TPOTClassifier
from sklearn.metrics import roc_auc_score



In [18]:
# Instantiate TPOTClassifier
tpot = TPOTClassifier(
    generations=5,
    population_size=20,
    verbosity=2,
    scoring='roc_auc',
    random_state=42,
    disable_update_check=True,
    config_dict='TPOT light'
)
tpot.fit(x_train, y_train)

HBox(children=(HTML(value='Optimization Progress'), FloatProgress(value=0.0, max=120.0), HTML(value='')))


Generation 1 - Current best internal CV score: 0.7422459184429089

Generation 2 - Current best internal CV score: 0.7422459184429089

Generation 3 - Current best internal CV score: 0.7422459184429089

Generation 4 - Current best internal CV score: 0.7422459184429089

Generation 5 - Current best internal CV score: 0.7456308339276876

Best pipeline: MultinomialNB(Normalizer(input_matrix, norm=l2), alpha=0.001, fit_prior=True)


TPOTClassifier(config_dict='TPOT light', disable_update_check=True,
               generations=5, population_size=20, random_state=42,
               scoring='roc_auc', verbosity=2)

In [19]:
# AUC score for tpot model
tpot_auc_score = roc_auc_score(y_test, tpot.predict_proba(x_test)[:, 1])
print(f'\nAUC score: {tpot_auc_score:.4f}')


AUC score: 0.7637


In [20]:
# Print best pipeline steps
print('\nBest pipeline steps:', end='\n')
for idx, (name, transform) in enumerate(tpot.fitted_pipeline_.steps, start=1):
    # Print idx and transform
    print(f'{idx}. {transform}')


Best pipeline steps:
1. Normalizer()
2. MultinomialNB(alpha=0.001)


### Task 8 : Checking the variance

In [21]:
# X_train's variance, rounding the output to 3 decimal places

x_train.var().round(3)

Recency (months)              66.929
Frequency (times)             33.830
Monetary (c.c. blood)    2114363.700
Time (months)                611.147
dtype: float64

### Task 9 : Normalization

In [22]:
# Copy X_train and X_test into X_train_normed and X_test_normed

x_train_normed = x_train.copy()
x_test_normed = x_test.copy()

In [23]:
# Column to be normalized

col_to_normalize = 'Monetary (c.c. blood)'

In [24]:
#import numpy package

import numpy as np

In [25]:
# Log normalization

for df_ in [x_train_normed, x_train_normed]:
    df_['monetary_log'] = np.log(df_[col_to_normalize])

In [26]:
df_.columns

Index(['Recency (months)', 'Frequency (times)', 'Monetary (c.c. blood)',
       'Time (months)', 'monetary_log'],
      dtype='object')

In [27]:
df_.drop(['Monetary (c.c. blood)'],axis = 1, inplace = True)

In [28]:
df_.columns

Index(['Recency (months)', 'Frequency (times)', 'Time (months)',
       'monetary_log'],
      dtype='object')

In [29]:
# Check the variance for X_train_normed
x_train_normed.var().round(3)

Recency (months)      66.929
Frequency (times)     33.830
Time (months)        611.147
monetary_log           0.837
dtype: float64

### Task 5 : Logistic Regression

In [30]:
from sklearn import linear_model

In [31]:
# Instantiate LogisticRegression
logreg =  linear_model.LogisticRegression(
    solver='liblinear',
    random_state=42
)

In [32]:
x_train_normed.shape

(561, 4)

In [33]:
y_train.shape

(561,)

In [34]:
# Train the model
logreg.fit(x_train_normed, y_train)

LogisticRegression(random_state=42, solver='liblinear')

In [35]:
# AUC score for tpot model
logreg_auc_score = roc_auc_score(y_test, logreg.predict_proba(x_test_normed)[:, 1])
print(f'\nAUC score: {logreg_auc_score:.4f}')


AUC score: 0.3239


### Task 11 : Conclusion

In [36]:
# Importing itemgetter
from operator import itemgetter



In [37]:
# Sort models based on their AUC score from highest to lowest
sorted(
    [('tpot', tpot_auc_score), ('logreg', logreg_auc_score)],
    key=itemgetter(1), reverse = True
)

[('tpot', 0.7637476160203432), ('logreg', 0.32390336935791475)]