In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [4]:
# Print out the first 2 lines (header row + first record) of the transfusion.data file
!head -n 2 /content/transfusion.data

Recency (months),Frequency (times),Monetary (c.c. blood),Time (months),"whether he/she donated blood in March 2007"
2 ,50,12500,98 ,1


In [5]:
# The file is comma-separated, so it loads directly as a CSV.
# The long outcome column is shortened to "Target" in the same expression,
# which keeps this cell free of in-place mutation.
data = pd.read_csv("transfusion.data").rename(
    columns={"whether he/she donated blood in March 2007": "Target"}
)
data.head()

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months),Target
0,2,50,12500,98,1
1,0,13,3250,28,1
2,1,16,4000,35,1
3,2,20,5000,45,1
4,1,24,6000,77,0


In [6]:
# Summarize the frame: row count, column dtypes and non-null counts.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 748 entries, 0 to 747
Data columns (total 5 columns):
 #   Column                 Non-Null Count  Dtype
---  ------                 --------------  -----
 0   Recency (months)       748 non-null    int64
 1   Frequency (times)      748 non-null    int64
 2   Monetary (c.c. blood)  748 non-null    int64
 3   Time (months)          748 non-null    int64
 4   Target                 748 non-null    int64
dtypes: int64(5)
memory usage: 29.3 KB


In [7]:
# Target class distribution: share of rows in each class, rounded to 3 decimals.
data["Target"].value_counts(normalize=True).round(3)

0    0.762
1    0.238
Name: Target, dtype: float64

# Training and testing:

In [8]:
# Hold out 25% of the rows for testing. Stratifying on the target keeps the
# class proportions the same in both splits; random_state fixes the shuffle.
features = data.drop(columns="Target")
target = data.Target
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.25,
    random_state=50,
    stratify=target,
)

In [9]:
# Preview the first training feature rows (the index keeps the original row labels).
X_train.head()

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months)
705,16,1,250,16
2,1,16,4000,35
82,2,4,1000,16
363,21,13,3250,57
560,4,11,2750,64


In [10]:
# Preview the first training labels; their index labels match the rows of X_train.
y_train.head()

705    0
2      1
82     0
363    0
560    1
Name: Target, dtype: int64

# TPOT Classifier:

In [13]:
#Import TPOTClassifier and roc_auc_score:
!pip install tpot
from tpot import TPOTClassifier
from sklearn.metrics import roc_auc_score


# Instantiate TPOTClassifier:
#Here, a TPOTClassifier object is created with specific configuration parameters:

#generations: Number of iterations to run the optimization process.
#population_size: Number of individuals (candidate pipelines) in each generation.
#verbosity: Level of detail in the output (2 provides more detailed information).
#scoring: The scoring metric to be optimized (here, it's ROC AUC).
#random_state: Seed for reproducibility.
#disable_update_check: Disables the check for updates during optimization.
#config_dict: Specifies the configuration dictionary for TPOT; in this case, it's set to 'TPOT light' for faster optimization.

tpot = TPOTClassifier(
    generations=5,
    population_size=20,
    verbosity=2,
    scoring="roc_auc",
    random_state=42,
    disable_update_check=True,
    config_dict="TPOT light"
)

tpot.fit(X_train, y_train)

Collecting tpot
  Downloading TPOT-0.12.1-py3-none-any.whl (87 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/87.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━[0m [32m81.9/87.4 kB[0m [31m2.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m87.4/87.4 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Collecting deap>=1.2 (from tpot)
  Downloading deap-1.4.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (135 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m135.4/135.4 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting update-checker>=0.16 (from tpot)
  Downloading update_checker-0.18.0-py3-none-any.whl (7.0 kB)
Collecting stopit>=1.1.1 (from tpot)
  Downloading stopit-1.1.2.tar.gz (18 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected 

Optimization Progress:   0%|          | 0/120 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.7441203167810555

Generation 2 - Current best internal CV score: 0.7441203167810555

Generation 3 - Current best internal CV score: 0.7441203167810555

Generation 4 - Current best internal CV score: 0.7441203167810555

Generation 5 - Current best internal CV score: 0.7453398926654741

Best pipeline: LogisticRegression(GaussianNB(input_matrix), C=25.0, dual=False, penalty=l2)


In [14]:
# Evaluate the TPOT pipeline on the held-out set: ROC AUC computed from the
# predicted probability of class 1.
tpot_probabilities = tpot.predict_proba(X_test)[:, 1]
tpot_auc_score = roc_auc_score(y_test, tpot_probabilities)
print(f'\nAUC score: {tpot_auc_score:.4f}')


AUC score: 0.7924




In [15]:
# List the steps of the pipeline TPOT selected, numbered from 1 in pipeline order.
print("\nBest pipeline steps:")
for position, (_, step) in enumerate(tpot.fitted_pipeline_.steps, start=1):
    # Only the fitted step object is shown; the step name is not needed here.
    print(f"{position}. {step}")



Best pipeline steps:
1. StackingEstimator(estimator=GaussianNB())
2. LogisticRegression(C=25.0, random_state=42)


In [16]:
# Per-feature variance of the training set, rounded to 3 decimal places.
X_train.var().round(3)

Recency (months)              61.656
Frequency (times)             35.962
Monetary (c.c. blood)    2247613.875
Time (months)                596.196
dtype: float64

In [17]:
# "Monetary" has a far higher variance than the other features, so it is
# log-transformed before fitting a linear model.

# Column to normalize:
col_to_normalize = "Monetary (c.c. blood)"


def _with_log_column(frame):
    """Return a new frame where `col_to_normalize` is replaced by `monetary_log`.

    The new column holds log1p of the original values and is appended as the
    last column; the input frame is left untouched.
    """
    return frame.drop(columns=col_to_normalize).assign(
        monetary_log=np.log1p(frame[col_to_normalize])
    )


# Apply the same transformation to both splits.
X_train_norm = _with_log_column(X_train)
X_test_norm = _with_log_column(X_test)

# Check the variance of X_train_norm and print it:
variance_info = X_train_norm.var().round(3)
print(f"\nVariance after normalization:\n{variance_info}")


Variance after normalization:
Recency (months)      61.656
Frequency (times)     35.962
Time (months)        596.196
monetary_log           0.834
dtype: float64


In [18]:
# Import necessary modules:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

# Fit a liblinear logistic regression on the log-normalized training features
# (fit() returns the estimator itself, so it can be chained).
lr = LogisticRegression(solver="liblinear", random_state=42).fit(X_train_norm, y_train)

# Score the held-out set from the predicted probability of class 1.
lr_probabilities = lr.predict_proba(X_test_norm)[:, 1]
lr_auc_score = roc_auc_score(y_test, lr_probabilities)
print(f"AUC score for Logistic Regression: {lr_auc_score:.4f}")

# Report the fitted parameters, one feature per line, followed by the intercept.
print("\nLogistic Regression Coefficients:")
coefficients = pd.Series(lr.coef_[0], index=X_train_norm.columns)
for feature, coef in coefficients.items():
    print(f"{feature}: {coef:.4f}")
print(f"Intercept: {lr.intercept_[0]:.4f}")


AUC score for Logistic Regression: 0.8004

Logistic Regression Coefficients:
Recency (months): -0.0901
Frequency (times): 0.1056
Time (months): -0.0240
monetary_log: 0.2001
Intercept: -1.6419


In [19]:
# Rank the models by test-set AUC, best first.
from operator import itemgetter

auc_by_model = {"tpot": tpot_auc_score, "logreg": lr_auc_score}
# items() yields (name, score) tuples; sort on the score, descending.
sorted(auc_by_model.items(), key=itemgetter(1), reverse=True)


[('logreg', 0.800381436745073), ('tpot', 0.7924348378893834)]