In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tpot import TPOTClassifier
from sklearn.metrics import roc_auc_score
from sklearn import linear_model
from operator import itemgetter



In [2]:
# Load the blood-transfusion dataset from disk into a DataFrame
transfusion = pd.read_csv(filepath_or_buffer='datasets/transfusion.data')

In [3]:
# Preview the first five rows to get a feel for the columns and values
print(transfusion.head(n=5))

   Recency (months)  Frequency (times)  Monetary (c.c. blood)  Time (months)  \
0                 2                 50                  12500             98   
1                 0                 13                   3250             28   
2                 1                 16                   4000             35   
3                 2                 20                   5000             45   
4                 1                 24                   6000             77   

   whether he/she donated blood in March 2007  
0                                           1  
1                                           1  
2                                           1  
3                                           1  
4                                           0  


In [4]:
# Show a concise summary of the DataFrame (column names, non-null counts, dtypes).
# DataFrame.info() writes its report directly to stdout and returns None, so it
# is called on its own: wrapping it in print() would emit a stray trailing
# "None" line after the summary.
transfusion.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 748 entries, 0 to 747
Data columns (total 5 columns):
 #   Column                                      Non-Null Count  Dtype
---  ------                                      --------------  -----
 0   Recency (months)                            748 non-null    int64
 1   Frequency (times)                           748 non-null    int64
 2   Monetary (c.c. blood)                       748 non-null    int64
 3   Time (months)                               748 non-null    int64
 4   whether he/she donated blood in March 2007  748 non-null    int64
dtypes: int64(5)
memory usage: 29.3 KB
None


In [5]:
# Give the long label column the short name 'target' (modifies `transfusion` in place)
transfusion.rename(
    {'whether he/she donated blood in March 2007': 'target'},
    axis='columns',
    inplace=True,
)

In [6]:
# Confirm the rename by showing the first two rows
print(transfusion.iloc[:2])

   Recency (months)  Frequency (times)  Monetary (c.c. blood)  Time (months)  \
0                 2                 50                  12500             98   
1                 0                 13                   3250             28   

   target  
0       1  
1       1  


In [7]:
# Share of each class in the target column, rounded to 3 decimal places
target_incidence = transfusion['target'].value_counts(normalize=True)
print(target_incidence.round(3))

0    0.762
1    0.238
Name: target, dtype: float64


In [8]:
# Hold out 25% of the rows for testing; stratifying on the target keeps the
# class proportions the same in both the training and the testing split.
features = transfusion.drop('target', axis=1)
labels = transfusion['target']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.25,
    random_state=42,
    stratify=labels,
)

In [9]:
# Peek at the first two training rows (note the shuffled index after the split)
print(X_train.iloc[:2])

     Recency (months)  Frequency (times)  Monetary (c.c. blood)  Time (months)
334                16                  2                    500             16
99                  5                  7                   1750             26


In [10]:
# Configure the automated pipeline search and run it on the training data.
tpot = TPOTClassifier(
    # Search budget
    generations=5,
    population_size=20,
    # Restrict the search space via TPOT's built-in 'TPOT light' configuration
    config_dict='TPOT light',
    # Rank candidate pipelines by ROC AUC
    scoring='roc_auc',
    # Fixed seed so the search is repeatable
    random_state=42,
    # Reporting / housekeeping
    verbosity=2,
    disable_update_check=True,
)
tpot.fit(X_train, y_train)


Optimization Progress:   0%|          | 0/120 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.7422459184429089

Generation 2 - Current best internal CV score: 0.7422459184429089

Generation 3 - Current best internal CV score: 0.7422459184429089

Generation 4 - Current best internal CV score: 0.7422459184429089

Generation 5 - Current best internal CV score: 0.7423330644124078

Best pipeline: LogisticRegression(RobustScaler(input_matrix), C=25.0, dual=False, penalty=l2)


In [11]:
# Score the TPOT pipeline on the held-out test set, using the second column
# of predict_proba as the ranking score for ROC AUC.
tpot_test_proba = tpot.predict_proba(X_test)[:, 1]
tpot_auc_score = roc_auc_score(y_true=y_test, y_score=tpot_test_proba)
print(f'\nAUC score (TPOT): {tpot_auc_score:.4f}')


AUC score (TPOT): 0.7858


In [12]:
# List the steps of the winning pipeline, numbered from 1.
# Each entry in `steps` is a (name, estimator) pair; only the estimator is shown.
fitted_steps = tpot.fitted_pipeline_.steps
print('\nBest pipeline steps:')
for idx, (_, transform) in enumerate(fitted_steps, start=1):
    print(f'{idx}. {transform}')


Best pipeline steps:
1. RobustScaler()
2. LogisticRegression(C=25.0, random_state=42)


In [13]:
# Per-column variance of the training features (3 decimal places), printed
# under a heading; `sep='\n'` puts the heading and the table on separate lines.
print("\nX_train's variance:", X_train.var().round(3), sep='\n')


X_train's variance:
Recency (months)              66.929
Frequency (times)             33.830
Monetary (c.c. blood)    2114363.700
Time (months)                611.147
dtype: float64


In [14]:
# Work on independent copies so the original splits stay untouched
X_train_normed = X_train.copy()
X_test_normed = X_test.copy()

In [15]:
# Specify which column to normalize: this is the feature whose variance is
# orders of magnitude larger than the others in the X_train variance printout.
col_to_normalize = 'Monetary (c.c. blood)'

In [16]:
# Replace the high-variance column with its natural logarithm in both splits.
for df_ in (X_train_normed, X_test_normed):
    # pop() removes the original column from the frame and returns it, so the
    # frame ends up with 'monetary_log' appended and the raw column gone.
    df_['monetary_log'] = np.log(df_.pop(col_to_normalize))

In [17]:
# Re-check the per-column variance after the log transform (3 decimal places)
print("\nX_train_normed's variance:", X_train_normed.var().round(3), sep='\n')


X_train_normed's variance:
Recency (months)      66.929
Frequency (times)     33.830
Time (months)        611.147
monetary_log           0.837
dtype: float64


In [18]:
# Build a logistic regression classifier with the liblinear solver and a fixed seed
logreg = linear_model.LogisticRegression(random_state=42, solver='liblinear')

In [19]:
# Fit the classifier on the log-normalized training features
logreg.fit(X=X_train_normed, y=y_train)

In [20]:
# Score the logistic regression on the log-normalized test set, using the
# second column of predict_proba as the ranking score for ROC AUC.
logreg_test_proba = logreg.predict_proba(X_test_normed)[:, 1]
logreg_auc_score = roc_auc_score(y_true=y_test, y_score=logreg_test_proba)
print(f'\nAUC score (Logistic Regression): {logreg_auc_score:.4f}')


AUC score (Logistic Regression): 0.7891


In [21]:
# Collect (model name, AUC) pairs and rank them from best to worst AUC
model_scores = [
    ('TPOT', tpot_auc_score),
    ('Logistic Regression', logreg_auc_score),
]
sorted_models = sorted(model_scores, key=lambda entry: entry[1], reverse=True)

## 11. Conclusion
<p>The demand for blood varies throughout the year; a prominent example is the notable decline during busy holiday seasons. Accurately forecasting future blood supply makes it possible to take proactive measures, potentially saving more lives.</p>
<p>In this analysis, we used TPOT for automatic model selection, which produced a pipeline with a test AUC score of 0.7858. This compares favorably with a baseline model that always predicts 0, which, given the target incidence, would be correct only about 76% of the time (that figure is an accuracy rather than an AUC, so the comparison is indicative rather than exact). We then improved on this result by log-normalizing the highest-variance feature and fitting a logistic regression model, which raised the AUC score to 0.7891, a relative improvement of roughly 0.4%. In machine learning, even marginal improvements can be significant depending on the application.</p>
<p>Furthermore, the logistic regression model chosen for this task offers interpretability. This interpretability enables us to assess how much of the variance in the response variable (target) can be explained by the other variables in our dataset. This understanding is crucial for gaining insights into the factors influencing blood donation behavior.</p>

In [22]:
# Report the ranking, one "name: score" line per model (4 decimal places)
print('\nSorted models based on AUC score:')
for entry in sorted_models:
    model, score = entry
    print(f'{model}: {score:.4f}')


Sorted models based on AUC score:
Logistic Regression: 0.7891
TPOT: 0.7858
