In [21]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tpot import TPOTClassifier
from sklearn.metrics import roc_auc_score
from sklearn import linear_model

In [22]:
# Read the transfusion data file into a DataFrame
transfusion = pd.read_csv(filepath_or_buffer='datasets/transfusion.data')

In [23]:
# Preview the first five rows of the dataset
transfusion.head(n=5)

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months),whether he/she donated blood in March 2007
0,2,50,12500,98,1
1,0,13,3250,28,1
2,1,16,4000,35,1
3,2,20,5000,45,1
4,1,24,6000,77,0


In [24]:
# Summarise the dataset structure: column names, non-null counts
# (to spot missing data), data types, and memory usage
transfusion.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 748 entries, 0 to 747
Data columns (total 5 columns):
 #   Column                                      Non-Null Count  Dtype
---  ------                                      --------------  -----
 0   Recency (months)                            748 non-null    int64
 1   Frequency (times)                           748 non-null    int64
 2   Monetary (c.c. blood)                       748 non-null    int64
 3   Time (months)                               748 non-null    int64
 4   whether he/she donated blood in March 2007  748 non-null    int64
dtypes: int64(5)
memory usage: 29.3 KB


In [25]:
# Rename the verbose outcome column to the short name 'target'.
# The renamed frame is rebound to `transfusion` rather than mutated with
# inplace=True. Re-running this cell is a no-op, because rename() ignores
# labels that are no longer present by default.
transfusion = transfusion.rename(
    columns={'whether he/she donated blood in March 2007': 'target'}
)

In [26]:
# Relative frequency of each class label in the target column (3 decimals)
(
    transfusion['target']
    .value_counts(normalize=True)
    .round(3)
)

target
0    0.762
1    0.238
Name: proportion, dtype: float64

In [27]:
# Separate predictors from the label, then make a 75/25 train/test split.
# Stratifying on the label keeps the imbalanced class ratio in both splits.
features = transfusion.drop(columns='target')
labels = transfusion['target']

X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.25,
    random_state=42,
    stratify=labels,
)

In [28]:
# Search settings for TPOT: 5 generations of 20 pipelines each, scored by
# ROC AUC, restricted to the 'TPOT light' configuration, seeded for repeatability
tpot_settings = dict(
    generations=5,
    population_size=20,
    verbosity=2,
    scoring='roc_auc',
    random_state=42,
    disable_update_check=True,
    config_dict='TPOT light',
)
tpot = TPOTClassifier(**tpot_settings)

# Run the pipeline search on the training split
tpot.fit(X_train, y_train)

Optimization Progress:   0%|          | 0/120 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.7422459184429089

Generation 2 - Current best internal CV score: 0.7422459184429089

Generation 3 - Current best internal CV score: 0.7422459184429089

Generation 4 - Current best internal CV score: 0.7422459184429089

Generation 5 - Current best internal CV score: 0.7423330644124079

Best pipeline: LogisticRegression(input_matrix, C=0.1, dual=False, penalty=l2)


In [29]:
# Score the fitted TPOT pipeline on the held-out test split using ROC AUC;
# column 1 of predict_proba is the probability of the positive class
tpot_test_proba = tpot.predict_proba(X_test)[:, 1]
tpot_auc_score = roc_auc_score(y_test, tpot_test_proba)
print(f'\nAUC score: {tpot_auc_score:.4f}')


AUC score: 0.7853


In [30]:
# List the steps of the best pipeline found by TPOT, numbered from 1
print('\nBest pipeline:')
for position, (step_name, estimator) in enumerate(tpot.fitted_pipeline_.steps, 1):
    # Only the fitted step object is shown; its name is not printed
    print(f'{position}. {estimator}')


Best pipeline:
1. LogisticRegression(C=0.1, random_state=42)


In [31]:
# Per-feature variance of the training split; a feature whose variance
# dwarfs the others can dominate the model and may need rescaling
(
    X_train
    .var()
    .round(3)
)

Recency (months)              66.929
Frequency (times)             33.830
Monetary (c.c. blood)    2114363.700
Time (months)                611.147
dtype: float64

In [32]:
# The variance of 'Monetary (c.c. blood)' is orders of magnitude larger than
# that of the other features, so it is replaced by its natural logarithm.

def log_transform_monetary(features, column='Monetary (c.c. blood)', new_column='monetary_log'):
    """Return a new frame with ``column`` replaced by its natural logarithm.

    Parameters
    ----------
    features : pandas.DataFrame
        Frame containing ``column``; it is not modified.
    column : str
        Name of the column to log-transform and then drop.
    new_column : str
        Name of the appended column holding ``np.log(features[column])``.

    Returns
    -------
    pandas.DataFrame
        New frame without ``column`` and with ``new_column`` appended last.

    Raises
    ------
    ValueError
        If ``column`` contains zero or negative values, whose logarithm
        would be -inf or NaN.
    """
    values = features[column]
    # Fail early with a clear message instead of passing -inf/NaN downstream
    if (values <= 0).any():
        raise ValueError(
            f'{column!r} must be strictly positive to take its logarithm'
        )
    # assign() returns a new frame, so the input split is left untouched
    return features.assign(**{new_column: np.log(values)}).drop(columns=column)


# Apply the same transformation to both splits; X_train and X_test stay unchanged
X_train_normed = log_transform_monetary(X_train)
X_test_normed = log_transform_monetary(X_test)

In [33]:
# Per-feature variance of the training split after the log transform
X_train_normed.var().round(decimals=3)

Recency (months)      66.929
Frequency (times)     33.830
Time (months)        611.147
monetary_log           0.837
dtype: float64

In [34]:
# Build a logistic regression classifier (liblinear solver, fixed seed)
logreg = linear_model.LogisticRegression(solver='liblinear', random_state=42)

# Fit it on the log-transformed training split
logreg.fit(X_train_normed, y_train)

In [35]:
# Score the logistic regression model on the log-transformed test split;
# column 1 of predict_proba is the probability of the positive class
logreg_test_proba = logreg.predict_proba(X_test_normed)[:, 1]
logreg_auc_score = roc_auc_score(y_test, logreg_test_proba)
print(f'\nAUC score: {logreg_auc_score:.4f}')


AUC score: 0.7891
