In [99]:
import pandas as pd

In [100]:
df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/blood-transfusion/transfusion.data')


In [101]:
df = df.rename(columns= {
    'Recency(months)': 'months_since_last_donation',
    'Frequency(times)': 'number_of_donations',
    'Monetary (c.c.blood)': 'total_volume_donated',
    'Time': 'months_since_first_donation',
    'whether he/she donated blood in March 2007': 'made_donation_in_march_2007'
})

In [102]:
df.head()

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months),made_donation_in_march_2007
0,2,50,12500,98,1
1,0,13,3250,28,1
2,1,16,4000,35,1
3,2,20,5000,45,1
4,1,24,6000,77,0


In [103]:
# Print the number of data points in each class
class_counts = df['made_donation_in_march_2007'].value_counts()
print(class_counts)


0    570
1    178
Name: made_donation_in_march_2007, dtype: int64


In [104]:
X = df.drop(columns='made_donation_in_march_2007')
y = df['made_donation_in_march_2007']

In [105]:
from sklearn.model_selection import train_test_split

In [106]:
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                   test_size = 0.25,
                                                   shuffle=True,
                                                   random_state=1)

In [107]:
# Get initial metrics estimated

# Using simple pandas value counts method
print(y_train.value_counts)

#Using sklearn accuracy score
import numpy as np
from sklearn.metrics import accuracy_score

majority_class = y_train.mode([0])
prediction = np.full(shape=y_train.shape,
                    fill_value=majority_class)

accuracy_score(y_train, prediction)

<bound method IndexOpsMixin.value_counts of 352    0
207    0
107    0
17     1
714    0
      ..
645    0
715    0
72     1
235    0
37     1
Name: made_donation_in_march_2007, Length: 561, dtype: int64>


0.7718360071301248

In [108]:
from sklearn.pipeline import make_pipeline

from sklearn.preprocessing import RobustScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression

In [109]:
pipeline = make_pipeline(\
                        RobustScaler(),
                        SelectKBest(f_classif),
                        LogisticRegression(solver='lbfgs'))

In [110]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'selectkbest__k': [1, 2, 3, 4],
    'logisticregression__class_weight': [None, 'balanced'],
    'logisticregression__C': [.0001, .001, .01, .1, 10.0, 100.0, 1000.0]
}

gridsearch = GridSearchCV(pipeline, param_grid=param_grid,
                         cv=5, scoring='accuracy', verbose=1)

In [111]:
gridsearch.fit(X_train, y_train)

Fitting 5 folds for each of 56 candidates, totalling 280 fits


In [112]:

# Interpret the results.

# Best cross validation score
print('Cross Validation Score:', gridsearch.best_score_)

# Best parameters which resulted in the best score
print('Best Parameters:', gridsearch.best_params_)

# Which features were selected?
selector = gridsearch.best_estimator_.named_steps['selectkbest']
all_names = X_train.columns
selected_mask = selector.get_support()
selected_names = all_names[selected_mask]
unselected_names = all_names[~selected_mask]

print('Features selected:')
for name in selected_names:
    print(name)

print()
print('Features not selected:')
for name in unselected_names:
    print(name)

Cross Validation Score: 0.7807522123893804
Best Parameters: {'logisticregression__C': 0.1, 'logisticregression__class_weight': None, 'selectkbest__k': 4}
Features selected:
Recency (months)
Frequency (times)
Monetary (c.c. blood)
Time (months)

Features not selected:


In [113]:
gridsearch.best_score_

0.7807522123893804

In [114]:
gridsearch.best_params_

{'logisticregression__C': 0.1,
 'logisticregression__class_weight': None,
 'selectkbest__k': 4}

In [115]:

#Get the best model and check it against test data set.

# Predict with X_test features
y_pred = gridsearch.predict(X_test)

# Compare predictions to y_test labels
test_score = accuracy_score(y_test, y_pred)
print('Accuracy Score on test data set:', test_score)

Accuracy Score on test data set: 0.7540106951871658
