In [16]:
import pandas as pd
import numpy as np

In [2]:
#Loading the blood donations data:

transfusion_df = pd.read_csv("transfusion.data", sep = ",")
transfusion_df.head()

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months),whether he/she donated blood in March 2007
0,2,50,12500,98,1
1,0,13,3250,28,1
2,1,16,4000,35,1
3,2,20,5000,45,1
4,1,24,6000,77,0


In [3]:
transfusion_df.shape

(748, 5)

In [4]:
transfusion_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 748 entries, 0 to 747
Data columns (total 5 columns):
 #   Column                                      Non-Null Count  Dtype
---  ------                                      --------------  -----
 0   Recency (months)                            748 non-null    int64
 1   Frequency (times)                           748 non-null    int64
 2   Monetary (c.c. blood)                       748 non-null    int64
 3   Time (months)                               748 non-null    int64
 4   whether he/she donated blood in March 2007  748 non-null    int64
dtypes: int64(5)
memory usage: 29.3 KB


In [5]:
#Checking whether any null values present in the dataset or not:

transfusion_df.isnull().sum()

Recency (months)                              0
Frequency (times)                             0
Monetary (c.c. blood)                         0
Time (months)                                 0
whether he/she donated blood in March 2007    0
dtype: int64

In [6]:
transfusion_df.dtypes

Recency (months)                              int64
Frequency (times)                             int64
Monetary (c.c. blood)                         int64
Time (months)                                 int64
whether he/she donated blood in March 2007    int64
dtype: object

In [7]:
transfusion_df.describe()

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months),whether he/she donated blood in March 2007
count,748.0,748.0,748.0,748.0,748.0
mean,9.506684,5.514706,1378.676471,34.282086,0.237968
std,8.095396,5.839307,1459.826781,24.376714,0.426124
min,0.0,1.0,250.0,2.0,0.0
25%,2.75,2.0,500.0,16.0,0.0
50%,7.0,4.0,1000.0,28.0,0.0
75%,14.0,7.0,1750.0,50.0,0.0
max,74.0,50.0,12500.0,98.0,1.0


In [None]:
transfusion_df.iloc[:,4].mean()

#Looks the data is imbalanced, it says 23.8% of people he/she donated blood in March 2007 and 76.2% of the people will not donate. 

### In our dataset has all numerical values and there is no missing values present in it.

### Here we need to predict whether he/she donated blood in March 2007 or not, So this column is our Target Variable

In [11]:
transfusion_df = transfusion_df.rename(columns={"whether he/she donated blood in March 2007":"Target"})

In [12]:
transfusion_df

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months),Target
0,2,50,12500,98,1
1,0,13,3250,28,1
2,1,16,4000,35,1
3,2,20,5000,45,1
4,1,24,6000,77,0
...,...,...,...,...,...
743,23,2,500,38,0
744,21,2,500,52,0
745,23,3,750,62,0
746,39,1,250,39,0


# Training data and Testing Data Split:

As out Target column resulted we gonna split the 76% of data for training and remaining 24% is for Testing.

In [18]:
X = transfusion_df.drop("Target", axis=1)
y = transfusion_df["Target"]

In [21]:
from sklearn.model_selection import train_test_split

In [39]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.24, random_state=0, stratify=y)

In [36]:
print("X_train :",X_train.shape)
print()
print("X_test :", X_test.shape)
print()
print("y_train :", y_train.shape)
print()
print("y_test :", y_test.shape)

X_train : (568, 4)

X_test : (180, 4)

y_train : (568,)

y_test : (180,)


# Let's work with AutomatedMachineLearning Library TPOT

In [48]:
from tpot import TPOTClassifier
from tpot import TPOTRegressor

tpot_cls = TPOTClassifier(generations=5, verbosity=2, max_time_mins=30)
tpot_cls.fit(X_train, y_train)

Optimization Progress:   0%|          | 0/100 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.8011799410029499

Generation 2 - Current best internal CV score: 0.8011799410029499

Generation 3 - Current best internal CV score: 0.8099052942089736

Generation 4 - Current best internal CV score: 0.8099052942089736

Generation 5 - Current best internal CV score: 0.8099518708275113

Best pipeline: XGBClassifier(PCA(input_matrix, iterated_power=5, svd_solver=randomized), learning_rate=0.001, max_depth=10, min_child_weight=6, n_estimators=100, n_jobs=1, subsample=0.9000000000000001, verbosity=0)


TPOTClassifier(generations=5, max_time_mins=30, verbosity=2)

In [50]:
tpot_cls.score(X_train, y_train)

0.8151408450704225

In [51]:
tpot_Reg = TPOTRegressor(generations=5, verbosity=2, max_time_mins=30)
tpot_Reg.fit(X_train, y_train)

Optimization Progress:   0%|          | 0/100 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: -0.15261882935235782

Generation 2 - Current best internal CV score: -0.15261882935235782

Generation 3 - Current best internal CV score: -0.15228867413035266

Generation 4 - Current best internal CV score: -0.1522335481250039

Generation 5 - Current best internal CV score: -0.15160734347196703

Best pipeline: RidgeCV(ElasticNetCV(DecisionTreeRegressor(MaxAbsScaler(ExtraTreesRegressor(input_matrix, bootstrap=True, max_features=0.8, min_samples_leaf=13, min_samples_split=6, n_estimators=100)), max_depth=1, min_samples_leaf=7, min_samples_split=16), l1_ratio=0.45, tol=0.1))


TPOTRegressor(generations=5, max_time_mins=30, verbosity=2)

In [52]:
tpot_Reg.score(X_train, y_train)

-0.13546088462611408