In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Import dataset

Name: Heart Disease UCI

Source: Kaggle

https://www.kaggle.com/ronitf/heart-disease-uci/downloads/heart-disease-uci.zip/1

Attribute Information: 
> 1. age 
> 2. sex 
> 3. chest pain type (4 values) 
> 4. resting blood pressure 
> 5. serum cholesterol in mg/dl
> 6. fasting blood sugar > 120 mg/dl
> 7. resting electrocardiographic results (values 0,1,2)
> 8. maximum heart rate achieved 
> 9. exercise induced angina 
> 10. oldpeak = ST depression induced by exercise relative to rest 
> 11. the slope of the peak exercise ST segment 
> 12. number of major vessels (0-3) colored by fluoroscopy
> 13. thal: 3 = normal; 6 = fixed defect; 7 = reversible defect

In [None]:
# Read the heart_cleaned data into `df` (replace the `...` placeholder with the loading call).
# The following cells call df.head() and df.info() on it.
df = ...

# Preview the first rows
df.head()

In [None]:
# Print a summary of the loaded data
df.info()

# Data preparation

In [None]:
# Columns treated as categorical (factor) variables; note that the class label
# `target` is part of this list.
factor_columns = [
    "sex",
    "cp",
    "fbs",
    "restecg",
    "exang",
    "slope",
    "ca",
    "target",
    "thal",
]

# Columns treated as continuous (numeric) variables.
numeric_columns = [
    "age",
    "trestbps",
    "chol",
    "thalach",
    "oldpeak",
]

# Supervised Learning

## Regression 

In this exercise we will try to predict the age of a person based on the other variables.

Since it's a continuous variable, it's a perfect regression problem

As usual, let's start by preparing our target variable

In [None]:
# Name of the column to predict in the regression section; the cells below
# compare column names against it and use it as the y-axis label.
target = "age"

In [None]:
# Isolate the target from the rest of the data:
#   y -> the `target` column, X -> the remaining columns (replace the `...` placeholders)
y = ...
X = ...
# Quick look at the feature data
print(X.head())

# get a list of the columns (presumably the feature columns of X — confirm)
feature_list = ...

In [None]:
# Plot every numeric column against the regression target, one figure per column
for col in numeric_columns:
    if col == target:
        # the target goes on the y axis, so it is not plotted against itself
        continue
    plt.plot(X[col], y, 'o')
    plt.xlabel(col)
    plt.ylabel(target)
    plt.title(col)
    plt.show()

Then we need to split the data into a training and testing set to be able to evaluate the quality of our model and its generalizability

hint: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html

In [None]:
# import train_test_split from sklearn (fill in the module and the function name)
from sklearn.... import ...

# Split dataset into training and testing part
# (later cells read the names X_train, y_train, X_test and y_test)
...

# Display number of rows and columns for both training and testing sets
...
...

### Multiple Linear Regression

Now let's train our model (hint: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html)

In [None]:
# import Linear regression from sklearn
# (a later cell refers to the class by the name `LinearRegression`)
from sklearn.... import ...

# Instantiate model
model = ...

# Fit the model to the training set
...

It's time to evaluate the model

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Create a function that computes the RMSE, MAE and R² from actual and predicted values
def evaluate_regression(actuals, preds):
    """Print regression metrics comparing `preds` with `actuals`.

    Exercise template: replace each `...` with the value to print; the
    labels expect, in order, the RMSE, the MAE and the R². Nothing is returned.

    actuals: the true target values.
    preds: the predicted values for the same rows.
    """
    
    print("RMSE = {:.1f}".format(...))
    print("MAE = {:.1f}".format(...))

    print("\nR² = {:.2f}\n\n".format(...))

In [None]:
# Create predictions for training set (using the fitted model)
y_pred_train = ...

# Apply your function to see the performance on the data the model was trained on
print("In sample evaluation \n---------------------")
...

In [None]:
# Create predictions for test set (replace the `...` placeholder)
# The placeholder is written as `...` like every other blank in this notebook,
# so the cell is at least valid Python before it is filled in.
y_pred = ...

# Apply your function to see the performance
print("Out sample evaluation \n---------------------")
...

We can also use PCA to determine the optimal number of components to get the best performance (optional)

In [None]:
from sklearn.decomposition import PCA 
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# Chain a PCA and a linear regression so that the number of principal
# components can be tuned by cross-validation like any other hyper-parameter.
model = LinearRegression()
pca = PCA()
pipe = Pipeline(steps=[('pca', pca), ('linear', model)])

# Parameters of pipelines can be set using ‘__’ separated parameter names:
param_grid = {
    'pca__n_components': np.arange(1, 13)
}
# `iid` is intentionally not passed: the argument was deprecated and then
# removed from GridSearchCV, so passing it raises a TypeError on current
# scikit-learn. Fold scores are averaged without weighting, as iid=False did.
search = GridSearchCV(pipe, param_grid, cv=5,
                      return_train_score=False)
search.fit(X_train, y_train)
# No `scoring` is given, so the score is the pipeline's own `score` method
# (R² for a linear regression), not a classification accuracy.
print("Best parameter (CV score=%0.3f):" % search.best_score_)
print(search.best_params_)

# Plot the PCA spectrum
# (GridSearchCV works on clones, so `pca` itself still has to be fitted here)
pca.fit(X_train)

fig, (ax0, ax1) = plt.subplots(nrows=2, sharex=True, figsize=(6, 6))
# Start the x axis at 1 so the curve, the vertical marker and the lower panel
# (which shares this axis) all count components the same way.
ax0.plot(np.arange(1, pca.n_components_ + 1),
         pca.explained_variance_ratio_, linewidth=2)
ax0.set_ylabel('PCA explained variance')

ax0.axvline(search.best_estimator_.named_steps['pca'].n_components,
            linestyle=':', label='n_components chosen')
ax0.legend(prop=dict(size=12))

# For each number of components, keep the best cross-validated result.
# sort + drop_duplicates keeps the top-scoring row per value without relying
# on groupby.apply retaining the grouping column.
results = pd.DataFrame(search.cv_results_)
components_col = 'param_pca__n_components'
best_scores = (
    results.sort_values('mean_test_score', ascending=False)
    .drop_duplicates(components_col)
    .sort_values(components_col)
)

best_scores.plot(x=components_col, y='mean_test_score', yerr='std_test_score',
                 legend=False, ax=ax1)
ax1.set_ylabel('R² (val)')
ax1.set_xlabel('n_components')

plt.tight_layout()
plt.show()

### Gradient Boosting Regressor

hint: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Instantiate the model (a GradientBoostingRegressor, imported above)
model = ...

# Fit the model (on the training set)
...

# make predictions
preds = ...

# Evaluate performance (e.g. with evaluate_regression defined earlier)
...

In [None]:
# Get the feature importance (replace the `...` placeholder)
feature_imp = ...
print("Most important features : \n",feature_imp)

# Plot the feature importance (replace the `...` with the plotting call)
...
plt.show()

## Classification

We now turn to a classification problem. We will try to predict the target value (0 or 1) 

In [None]:
# From here on the column to predict is the class label `target`
# (this rebinds the `target` name used in the regression section)
target = "target"

In [None]:
# One box plot per numeric variable, split by the class label.
# The loop variable is named `col` rather than `y` so that it does not
# overwrite the notebook-level `y` target vector.
for col in numeric_columns:
    sns.boxplot(x=target, y=col, data=df, orient="v")
    plt.xticks(rotation=90)
    plt.show()

In [None]:
# Cross-tabulate the class label against each categorical variable and
# display the table of counts with a colour gradient
for col in factor_columns:
    if col == "target":
        # skip the label itself
        continue
    print("--- {} ---".format(col))
    counts = pd.crosstab(df[target], df[col])
    display(counts.style.background_gradient())

Take some time to analyze the plots above and draw some conclusions

In [None]:
# Isolate the target from the rest of the data:
#   y -> the `target` column, X -> the remaining columns (replace the `...` placeholders)
y = ...
X = ...
# Quick look at the feature data
print(X.head())

# get a list of the columns (presumably the feature columns of X — confirm)
feature_list = ...

First it's important to isolate the target variable

In order to get better estimates, let's normalize the data

Then we need to split the data into a training and testing set to be able to evaluate the quality of our model and its generalizability

In [None]:
# Split dataset into training and testing part
# (later cells read the names X_train, y_train, X_test and y_test)
...

# Display number of rows and columns for both training and testing sets
...
...

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, roc_auc_score

# Create a function that computes the accuracy, precision, recall, AUC and confusion matrix from actual and predicted values
def evaluate_classification(actuals, preds):
    """Print classification metrics comparing `preds` with `actuals`.

    Exercise template: replace each `...` with the value to print; the
    labels expect, in order, accuracy, precision, recall, AUC and the
    confusion matrix. Nothing is returned.

    actuals: the true class labels.
    preds: the predicted class labels for the same rows.
    """
    
    print("Accuracy = {:.1%}".format(...))
    print("Precision = {:.1%}".format(...))
    print("Recall = {:.1%}".format(...))

    print("AUC = {:.1%}".format(...))

    print("\nConfusion matrix: \n",...)
    
    print("\n")

### Logistic regression

hint: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

In [None]:
# import the logistic regression class from sklearn (fill in the module and the class name)
from sklearn.... import ...

# instantiate and fit the model to the training set
...
...

# make predictions
# (the plotting cell that follows reads a variable named `preds`)
...

#evaluate the model on the training set using your user defined function
print("In sample evaluation \n---------------------")
...

# evaluate the model on the test set using your user defined function
print("Out sample evaluation \n---------------------")
...

In [None]:
# compute the score of each row and plot it along with the target prediction (hint: sort data so it's cleaner) to see the sigmoid
# `proba` holds the scores; `preds` must already exist (presumably the class predictions made in the previous cell — confirm)
proba = ...
plt.scatter(range(len(proba)),...)
plt.scatter(range(len(preds)), ...)
plt.show()

### Random Forest

Now let's train our model

hint https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

In [None]:
# import the random forest classifier from sklearn
# (a later cell refers to the class by the name `RandomForestClassifier`)
from sklearn.... import ...


# instantiate and fit the model to the training set
...
...

# make predictions
...

#evaluate the model on the training set using your user defined function
print("In sample evaluation \n---------------------")
...

# evaluate the model on the test set using your user defined function
print("Out sample evaluation \n---------------------")
...

This is not too bad (in practice, depending on the context and risks, we might accept a model even if its recall and precision were lower).

However we have here an illustration of overfitting. Indeed the performances on the training set are much better than on the test set.

Let's now dig into the results of the model and try to understand what happened

In [None]:
# collect the feature importance from the model and sort by importance (replace the `...` placeholder)
feature_imp = ...
print("Most important features : \n",feature_imp)

# notebook magic: render the figures inline
%matplotlib inline
# plot a barplot of the features by decreasing importance (put a title + labels)
...
...
plt.show()

Let's change some parameters to try to improve performances on the test set

In [None]:
# Feel free to go in the documentation and change some of the parameters of the model to improve the performances
# (the goal stated above is better performance on the test set)

...

We could continue to try sets of parameters like this for a big amount of time but this is not very efficient, especially when you start with machine learning. 

Another more research-like approach is called GridSearch. This method is based on the principle that the user should create a grid of regularly spaced parameters and then test the model on each node of that grid and compare. This is a more systematic method but, as you can imagine, it can take a lot of time if you want to test a long list of combinations on a large dataset. That's why we also often use randomized grid search, where instead of running the model for all points of the grid, we only do it for some random combinations. 

In the next cell you will implement gridsearch yourself thanks to sklearn (https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html)

In [None]:
# import GridSearchCV from sklearn (fill in the module and the class name)
from sklearn.... import ...

# create a dictionary of 2 parameters with some range. Be careful not to create too many combinations or it will take time to run it
...

# base estimator whose parameters are searched
rf = RandomForestClassifier()

# Instantiate the gridsearchCV using your previously defined parameters dictionary. 
...

# fit the gridsearch object to the training set
...

print("Best parameters= {}".format(...))
print("Best score= {}".format(...))

# get the results of the gridsearch in a dataframe format and print by descending order of "mean_test_score"
results = ...
print(...)

# Display the grid points and highlight the point that gave the best model performance (don't forget a title, labels and legend)
...
...

# get the best estimator, use it to make predictions and evaluate the performances on the test set
best_model = ...

y_pred = ...

print("Out sample evaluation \n---------------------")
...

(Optional) Feel free to experiment with other ML models defined in sklearn to get comfortable with the model creation and analysis.
Here are some ideas:
* KNN
* Naive Bayes 
* SVM

### Neural network

Neural networks are very popular these days. 

So we will conclude this training by giving you the opportunity to create your own fully connected neural network as well

In [None]:
# Don't modify this - just run it
def performance_nn(model):
    """Train a compiled Keras model and report its accuracy.

    Fits `model` on the notebook-level X_train / y_train (30% of which is
    held out for validation), plots the training and validation accuracy per
    epoch, then prints loss and accuracy on the training and test sets.

    model: a compiled model exposing fit / evaluate / predict, with accuracy
        as its (first) metric.

    Returns the model's predictions on X_test.
    """
    history = model.fit(X_train, y_train,
                        epochs=500,  # to be modified
                        validation_split=0.3,
                        verbose=0)

    # Depending on the Keras version and on how the metric was named in
    # compile(), accuracy is logged as 'acc' or as 'accuracy'; accept both
    # instead of failing with a KeyError.
    curves = history.history
    acc_key = 'acc' if 'acc' in curves else 'accuracy'

    plt.plot(curves[acc_key])
    plt.plot(curves['val_' + acc_key])
    plt.title('accuracy')
    plt.ylabel('accuracy')
    plt.xlabel('epoch')
    plt.legend(['training', 'validation'], loc='upper left')
    plt.show()

    # evaluate() returns [loss, metric] for a model compiled with one metric
    results = model.evaluate(X_train, y_train)
    print("Training Loss = {:.2f}, \t accuracy = {:.2f} \n".format(results[0], results[1]))

    results = model.evaluate(X_test, y_test)
    print("Testing Loss = {:.2f}, accuracy  = {:.2f} \n".format(results[0], results[1],))

    # Hand the test-set predictions back instead of discarding them
    nn_preds = model.predict(X_test)
    return nn_preds

hint: https://keras.io/layers/core/

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Activation

# Start from an empty sequential model
model = Sequential()
# add dense layers to the model (you can use any number of layers you want and as many nodes you like) 
...
...

# Binary classification set-up: cross-entropy loss, accuracy as the reported metric
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train and evaluate with the helper defined above (the trailing `;` hides the cell output)
performance_nn(model);

#### Image recognition

To conclude this workshop, we will show a very basic and simple example of image recognition.

In the previous section you created a first neural network with a sequential fully connected topology. But there are many more topologies which are each best suited to a particular kind of machine learning problem. For image recognition for example convolutional neural networks have been proved to work best. We won't implement such CNN here because this will be done in a more advanced training in the future but we will just explore the basics of image recognition with sklearn.

To do so we will first load a different dataset containing small images of digits. 

In [None]:
from sklearn import datasets

# Load the digits dataset. NOTE: this rebinds `df`, which held the
# heart-disease data in the earlier sections.
df = datasets.load_digits()

In [None]:
# Pair each image with its label and show the first four in a 2x4 grid
images_and_labels = list(zip(df.images, df.target))
for position, (image, label) in enumerate(images_and_labels[:4], start=1):
    plt.subplot(2, 4, position)
    plt.axis('off')
    plt.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest')
    plt.title('Training: %i' % label)

In [None]:
# Flatten every image into a single row of features (one row per image)
n_samples = df.images.shape[0]
data = df.images.reshape(n_samples, -1)

In [None]:
from sklearn import datasets, neural_network, metrics

# Adapt the network topology (number of layers and number of nodes):
# replace the `...` in hidden_layer_sizes
classifier = neural_network.MLPClassifier(
    hidden_layer_sizes=(...),
    activation="relu",
    max_iter=3000,
    random_state=42,
    verbose=False,
)

# Learn the digits on the first half of the samples ...
half = n_samples // 2
classifier.fit(data[:half], df.target[:half])

# ... and predict the value of the digit on the second half
expected = df.target[half:]
predicted = classifier.predict(data[half:])

report = metrics.classification_report(expected, predicted)
print("Classification report for classifier %s:\n%s\n"
      % (classifier, report))
print("Confusion matrix:\n%s" % metrics.confusion_matrix(expected, predicted))

# Show the first eight images of the second half with their predicted digit
images_and_predictions = list(zip(df.images[half:], predicted))
for index, (image, prediction) in enumerate(images_and_predictions[:8]):
    plt.subplot(2, 4, index + 1)
    plt.axis('off')
    plt.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest')
    plt.title('Prediction: %i' % prediction)

plt.show()

# Log of the training loss recorded at each iteration
plt.plot(np.log(classifier.loss_curve_))
plt.xlabel("Epoch")
plt.ylabel("Log Loss")
plt.title("Learning curve");

# Going further

This introductory workshop to machine learning showed you how - easy - creating predictive models can be.

However note that we used a very simple dataset, with few cleaning steps and only numerical variables.

In practice we often need to go back and forth between modeling and cleaning until we achieve desired results.

Also model tuning and refinement might require some time with big datasets.

Here is a list of topics you could be interested in if you want to go further in supervised learning

* up/downsampling 
* Cross-validation & randomized grid search
* Advanced deep learning: RNN, CNN, GAN, LSTM, ...