Project Description

Forecasting blood supply is a serious and recurrent problem for blood collection managers. In this project, you will work with data collected from the donor database of the Blood Transfusion Service Center. The dataset, obtained from the Machine Learning Repository, consists of a random sample of 748 donors. Your task will be to predict whether a blood donor will donate within a given time window. You will look at the full model-building process: from inspecting the dataset to using the tpot library to automate your Machine Learning pipeline. To complete this project, you need to know some Python, pandas, and logistic regression.


In [1]:
#Importing the libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import sklearn
import warnings
# NOTE(review): no category or module filter is given, so this silences every
# warning for the rest of the session, including any raised by the libraries
# used below. Consider narrowing the filter.
warnings.filterwarnings('ignore')

In [2]:
# Load the donor dataset from a CSV file located next to the notebook.

transfusion = pd.read_csv('transfusion.csv')

In [3]:
# Preview the first five rows (five is the default for head()).

transfusion.head()

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months),whether he/she donated blood in March 2007
0,2,50,12500,98,1
1,0,13,3250,28,1
2,1,16,4000,35,1
3,2,20,5000,45,1
4,1,24,6000,77,0


In [4]:
# Dimensions of the dataset as (rows, columns).

transfusion.shape

(748, 5)

In [5]:
# Summarise column names, non-null counts and dtypes.
transfusion.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 748 entries, 0 to 747
Data columns (total 5 columns):
 #   Column                                      Non-Null Count  Dtype
---  ------                                      --------------  -----
 0   Recency (months)                            748 non-null    int64
 1   Frequency (times)                           748 non-null    int64
 2   Monetary (c.c. blood)                       748 non-null    int64
 3   Time (months)                               748 non-null    int64
 4   whether he/she donated blood in March 2007  748 non-null    int64
dtypes: int64(5)
memory usage: 29.3 KB


In [6]:
# List the current column names; the long label column
# 'whether he/she donated blood in March 2007' is a candidate for renaming.

transfusion.columns

Index(['Recency (months)', 'Frequency (times)', 'Monetary (c.c. blood)',
       'Time (months)', 'whether he/she donated blood in March 2007'],
      dtype='object')

In [7]:
# Shorten the verbose label column name to 'target' for brevity.
transfusion.rename(
    columns={'whether he/she donated blood in March 2007': 'target'},
    inplace=True,
)

In [8]:
# Confirm that the label column is now called 'target'.
transfusion.columns

Index(['Recency (months)', 'Frequency (times)', 'Monetary (c.c. blood)',
       'Time (months)', 'target'],
      dtype='object')

In [9]:
# Quick look at the first two rows after the rename.
transfusion.head(2)

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months),target
0,2,50,12500,98,1
1,0,13,3250,28,1


In [10]:
# Target incidence: the proportion of rows in each class of 'target',
# rounded to three decimals.

(
    transfusion['target']
    .value_counts(normalize=True)
    .round(3)
)

0    0.762
1    0.238
Name: target, dtype: float64

In [11]:
# Build the feature matrix: every column except the label.
# Selecting by name (rather than the positional slice `:4`) keeps this correct
# if columns are added or reordered upstream.

X = transfusion.drop(columns='target')

In [12]:
# Peek at the first two rows of the feature matrix.
X.head(2)

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months)
0,2,50,12500,98
1,0,13,3250,28


In [13]:
# (rows, columns) of the feature matrix.
X.shape

(748, 4)

In [14]:
# Label vector: the 'target' column, which is the last column of the frame.
y = transfusion['target']

In [15]:
# Peek at the first two labels.
y.head(2)

0    1
1    1
Name: target, dtype: int64

In [16]:
# Number of labels, as a one-element tuple.
y.shape

(748,)

In [17]:
# Split features and labels into train and test sets.
# 25% of the rows are held out for testing; stratifying on y passes the labels
# as class labels for the split, and random_state fixes the shuffle.

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.25,
    random_state=42,
)

In [18]:
# Report the shape of each split. The two train shapes are printed without a
# label; the two test shapes are prefixed with one.
print(X_train.shape)
print("shape of X_test", X_test.shape)
print(y_train.shape)
print("shape of y_test", y_test.shape)

(561, 4)
shape of X_test (187, 4)
(561,)
shape of y_test (187,)


In [19]:
# Use the TPOT library to find the best Machine Learning pipeline

from tpot import TPOTClassifier

In [20]:
from sklearn.metrics import roc_auc_score

In [21]:
# Configure a TPOT search and run it on the training split.
# The search is scored by ROC AUC, restricted to the 'TPOT light' configuration,
# and seeded for repeatability.

tpot = TPOTClassifier(
    config_dict='TPOT light',
    scoring='roc_auc',
    generations=5,
    population_size=20,
    random_state=42,
    verbosity=2,
    disable_update_check=True,
)

tpot.fit(X_train, y_train)


HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', max=120.0, style=ProgressStyle(de…


Generation 1 - Current best internal CV score: 0.7422459184429089

Generation 2 - Current best internal CV score: 0.7422459184429089

Generation 3 - Current best internal CV score: 0.7422459184429089

Generation 4 - Current best internal CV score: 0.7422459184429089

Generation 5 - Current best internal CV score: 0.7456308339276876

Best pipeline: MultinomialNB(Normalizer(input_matrix, norm=l2), alpha=0.001, fit_prior=True)


TPOTClassifier(config_dict='TPOT light', crossover_rate=0.1, cv=5,
               disable_update_check=True, early_stop=None, generations=5,
               log_file=None, max_eval_time_mins=5, max_time_mins=None,
               memory=None, mutation_rate=0.9, n_jobs=1, offspring_size=None,
               periodic_checkpoint_folder=None, population_size=20,
               random_state=42, scoring='roc_auc', subsample=1.0, template=None,
               use_dask=False, verbosity=2, warm_start=False)

In [22]:
# ROC AUC of the TPOT pipeline on the test split, using column 1 of
# predict_proba as the score.

tpot_auc_score = roc_auc_score(
    y_true=y_test,
    y_score=tpot.predict_proba(X_test)[:, 1],
)

In [23]:
# Report the TPOT test-set AUC rounded to three decimals.
print('tpot AUC score: ', tpot_auc_score.round(3))

tpot AUC score:  0.764


In [24]:
# Print each step of the best pipeline found by TPOT, numbered from 1.
print('\nBest pipeline steps:')
for idx, (_, transform) in enumerate(tpot.fitted_pipeline_.steps, start=1):
    # One line per step; passing idx and transform as extra print() arguments
    # would repeat the step on the same line.
    print(f'{idx}. {transform}')


Best pipeline steps:
1.Normalizer(copy=True, norm='l2') 1 Normalizer(copy=True, norm='l2')
2.MultinomialNB(alpha=0.001, class_prior=None, fit_prior=True) 2 MultinomialNB(alpha=0.001, class_prior=None, fit_prior=True)


In [25]:
# Display the fitted pipeline object selected by TPOT.
tpot.fitted_pipeline_

Pipeline(memory=None,
         steps=[('normalizer', Normalizer(copy=True, norm='l2')),
                ('multinomialnb',
                 MultinomialNB(alpha=0.001, class_prior=None, fit_prior=True))],
         verbose=False)

In [26]:
# Per-column variance of the training features, rounded to three decimals.

X_train.var().round(3)

Recency (months)              66.929
Frequency (times)             33.830
Monetary (c.c. blood)    2114363.700
Time (months)                611.147
dtype: float64

In [27]:
# Per-column variance of the full dataset (label column included).
transfusion.var().round(3)

Recency (months)              65.535
Frequency (times)             34.098
Monetary (c.c. blood)    2131094.230
Time (months)                594.224
target                         0.182
dtype: float64

In [28]:
# Copy X_train into X_train_normed so that later changes do not alter X_train.

X_train_normed = X_train.copy()

In [29]:
# Copy X_test into X_test_normed so that later changes do not alter X_test.
X_test_normed = X_test.copy()

In [30]:
# Name (a string) of the training-feature column with the highest variance.
# Computed from X_train, which is never modified, so re-running this cell
# always yields the same column.

col_to_normalize = X_train.var().idxmax()

In [31]:
# Replace the raw 'Monetary (c.c. blood)' column with its natural log in both
# the train and test copies.

for frame in (X_train_normed, X_test_normed):
    # pop() removes the original column and returns it; the logged values are
    # then appended as the new last column 'monetary_log'.
    frame['monetary_log'] = np.log(frame.pop('Monetary (c.c. blood)'))

In [32]:
# Per-column variance of the training features after the log transform,
# rounded to three decimals.

X_train_normed.var().round(3)

Recency (months)      66.929
Frequency (times)     33.830
Time (months)        611.147
monetary_log           0.837
dtype: float64

In [33]:
# Logistic regression model.
# Import the LogisticRegression estimator from sklearn.linear_model.


from sklearn.linear_model import LogisticRegression

In [34]:
# Instantiate a logistic regression (liblinear solver, fixed seed) and fit it
# on the log-normalised training features.

logreg = LogisticRegression(random_state=42, solver='liblinear')

logreg.fit(X_train_normed, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=42, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [35]:
# ROC AUC of the logistic regression on the log-normalised test split, using
# column 1 of predict_proba as the score.

logreg_auc_score = roc_auc_score(
    y_true=y_test,
    y_score=logreg.predict_proba(X_test_normed)[:, 1],
)
print('logreg AUC score:',logreg_auc_score.round(3))

logreg AUC score: 0.789


In [36]:
#Import itemgetter from operator module

from operator import itemgetter

In [37]:
# Rank the (model_name, model_score) pairs from highest to lowest score.

sorted(
    [('tpot', tpot_auc_score), ('logreg', logreg_auc_score)],
    key=lambda pair: pair[1],
    reverse=True,
)

[('logreg', 0.7890972663699937), ('tpot', 0.7637476160203432)]