In [None]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
import seaborn as sns
import numpy as np

# Cancer Inhibitors dataset
The following dataset comes from https://www.kaggle.com/xiaotawkaggle/inhibitors

On Leho there is an adapted csv of the cdk2.h5 dataset. The adaptations I already made for you were: extracting the data from the .h5 file, renaming the columns, adding the IDs of the compounds (row names), and combining everything with the target into a .csv file.

For cdk2, 1890 potential inhibitors are collected from chembl database, in which molecules with IC50 lower than 10 uM are usually considered as inhibitors, otherwise non-inhibitors.

The rownames represent the CHEMBL Id, which can be used to search the CHEMBL database: https://www.ebi.ac.uk/chembl/

The features are split into 3 categories that represent three different sets of molecular fingerprints calculated by RDKIT. For more information: http://www.rdkit.org/docs/GettingStartedInPython.html#topological-fingerprints
* ap: Atom Pairs
* mg: Morgan Fingerprints (Circular Fingerprints)
* tt: Topological Torsions

The actual values represent if a molecular fingerprint is present in a specific molecule or not (1: present, 0: not present)

The target column indicates whether the molecule is considered an inhibitor or not.

In [None]:
# The first CSV column holds the row names (the CHEMBL IDs), so it is used as the dataframe index
cdk2_csv = "../exercises/data/cdk2.csv"
cdk2_data = pd.read_csv(cdk2_csv, index_col=0)
# Preview the first five rows (five is the default for head)
cdk2_data.head(n=5)

In [None]:
# Report the dimensions of the dataframe as a (rows, columns) tuple
cdk2_shape = cdk2_data.shape
print("rows, columns", cdk2_shape)

In [None]:
# Bar chart of how often each class occurs in the target column
sns.countplot(x="target", data=cdk2_data)
# The classes are somewhat unbalanced; this is deliberately left untreated here

In [None]:
# Same kind of count plot, now for the first feature column (ap_1)
sns.countplot(x="ap_1", data=cdk2_data)
# The distribution looks strange and is not optimal, but it is kept as it is

**What is noteworthy of the plot above?**



**Notes:**
Due to the number of columns, a df.describe() will not show you a lot of information.

Pairplots and corrplots would be gigantic and also not very informative due to it being sparse data.

Removing outliers would not be a good idea, since a lot of the values that are different from 0 would already be considered outliers, whilst these are just the informative things.
    

In [None]:
# Target: the label column; features: every remaining column
y = cdk2_data.loc[:, "target"]
X = cdk2_data.drop(columns="target")

In [None]:
# Hold out roughly 20% of the rows as a test set; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [None]:
# Build a multinomial naive Bayes classifier (alpha=1) and train it on the training split.
# fit() returns the estimator itself, so construction and fitting are chained.
model = MultinomialNB(alpha=1).fit(X_train, y_train)

# Score the fitted model on the held-out test split and show the result
test_score = model.score(X_test, y_test)
print(test_score)

In [None]:
%%time
# ConfusionMatrixDisplay (and the extra metric helpers) are not part of the notebook's
# first import cell, so they are brought into scope here before use.
from sklearn.metrics import confusion_matrix,accuracy_score,ConfusionMatrixDisplay,recall_score,precision_score,f1_score

# Predict the test set exactly once and reuse the result for every report below,
# instead of calling model.predict three times and discarding the first result.
y_pred = model.predict(X_test)

# Confusion matrix: printed as raw counts and drawn as a labelled plot
cf = confusion_matrix(y_test, y_pred)
print(cf)
matrix = ConfusionMatrixDisplay(cf, display_labels=model.classes_)
matrix.plot()
plt.show()

# Text summary of the per-class metrics (classification_report comes from the top import cell)
print(classification_report(y_test, y_pred))

In [None]:
%%time
# Train a logistic regression model on the same split and evaluate it like the naive Bayes model.
# max_iter is raised from its default of 100 because the default produced a warning about iterations.
# Note: the %%time magic on this cell measures training, scoring, prediction and plotting together.
logReg_model = LogisticRegression(C=1, max_iter=1000)
logReg_model.fit(X_train, y_train)
print(logReg_model.score(X_test, y_test))

# Predict the test set once and reuse the result below (the original predicted three times
# and threw the first result away).
y_pred_logreg = logReg_model.predict(X_test)

# Confusion matrix: printed as raw counts and drawn as a labelled plot
cf = confusion_matrix(y_test, y_pred_logreg)
print(cf)
matrix = ConfusionMatrixDisplay(cf, display_labels=logReg_model.classes_)
matrix.plot()
plt.show()

print(classification_report(y_test, y_pred_logreg))

Also try and play with the hyperparameters a bit

## Conclusion
**What can we conclude regarding the metrics of both models?**


**What can we conclude regarding the time it takes to train the model?**



# MNIST dataset
MNIST is the "hello world" of AI.

MNIST is a dataset containing a lot of handwritten digits. The goal is to correctly identify the written digit from the image. There are many different sets and subsets available online. The data that we will be using comes from
https://www.kaggle.com/c/digit-recognizer/data

Each image contains 28x28 pixels => 784 values between 0 (black) and 255 (white)

Some initial code is given to show you how to display the images

In [None]:
# Load the MNIST CSV into a dataframe
mnist_csv = "data/MNIST.csv"
df = pd.read_csv(mnist_csv)

In [None]:
# Preview the first five rows of the dataset (five is the default for head)
df.head(n=5)

In [None]:
# Bar chart of how many samples there are for each digit label
sns.countplot(data=df, x="label")

In [None]:
# Target: the label column; features: every remaining (pixel) column
y = df["label"]
X = df.drop(columns="label")

In [None]:
# Change this value to see another sample
row_num_X = 3

# Extract the image from X by position: row row_num_X, all columns (:)
single_image = X.iloc[row_num_X, :]

# Convert the flat array of 784 pixels into nested arrays so as to create a 28x28 array
# [p0,p1,p2,p3,...,p783] becomes

# [[p0,p1,p2,p3,...,p27]
#  [p28,p29,p30,    ...]
#  ...
#  [          ..., p783]]
single_image_to_28x28 = single_image.values.reshape(28, 28)

# Show the image
plt.imshow(single_image_to_28x28, cmap='Greys')
# Also print the corresponding label. The label is looked up by position (iloc), exactly
# like the image above, so image and label stay paired even when the index is not 0..n-1.
print(f"Label: {y.iloc[row_num_X]}")

In [None]:
# Hold out roughly 20% of the rows as a test set; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [None]:
%%time
# Fit a multinomial naive Bayes model (alpha=1) on the training split
model = MultinomialNB(alpha=1).fit(X_train, y_train)
# Print the score on the test split first, then on the training split
for features, labels in ((X_test, y_test), (X_train, y_train)):
    print(model.score(features, labels))

In [None]:
# Predict the test set exactly once and reuse the result for every report below,
# instead of calling model.predict three times and discarding the first result.
y_pred = model.predict(X_test)

# Confusion matrix: printed as raw counts and drawn as a labelled plot
cf = confusion_matrix(y_test, y_pred)
print(cf)
matrix = ConfusionMatrixDisplay(cf, display_labels=model.classes_)
matrix.plot()
plt.show()

# Text summary of the per-class metrics
print(classification_report(y_test, y_pred))

In [None]:
# Scale the data before running logistic regression: on the unscaled data training took
# too long and needed too many iterations, even ending in warnings.
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
# Fit the scaler on the training split only, not on the full data set; otherwise the
# test split would no longer be new, unseen data for the scaler.
scaler = StandardScaler().fit(X_train)
# Both right-hand sides are evaluated before X_train and X_test are rebound to the scaled arrays
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
%%time
# Logistic regression on the same train/test split (fit returns the estimator, so the calls are chained)
model = LogisticRegression(C=1, max_iter=1000).fit(X_train, y_train)
test_score = model.score(X_test, y_test)
train_score = model.score(X_train, y_train)
print("Score of the testset:", test_score)
print("Score of the trainingset:", train_score)
# Kept as the final expression so the cell still displays the predicted labels
model.predict(X_test)

In [None]:
# Predict the test set once and reuse the result for both reports (the original predicted twice)
y_pred = model.predict(X_test)

# Confusion matrix: printed as raw counts and drawn as a labelled plot
cf = confusion_matrix(y_test, y_pred)
print(cf)
matrix = ConfusionMatrixDisplay(cf, display_labels=model.classes_)
matrix.plot()
plt.show()

# Text summary of the per-class metrics
print(classification_report(y_test, y_pred))

The following code lets you draw a number and predict it using your model

The model should be called 'model' or be renamed in the code below

Note that you need packages pillow and tkinter

`conda install pillow tk` or `pip3 install pillow tk` or `sudo dnf install python3-pillow python3-tkinter`

In [None]:
from PIL import ImageTk, Image, ImageDraw, ImageChops
import PIL
from tkinter import *

# Geometry of the drawing surface, in pixels
width = height = 280   # canvas width and canvas height
center = height // 2
white = 255            # canvas background value (a plain int; the original parentheses did not make a tuple)

def predict():
    """Classify the digit currently drawn on the canvas and print the predicted label.

    Reads the module-level ``output_image`` (the PIL copy of the drawing), inverts it,
    shrinks it to 28x28, shows it with matplotlib, and feeds the flattened pixels to
    the module-level ``model``.

    ``model`` is trained on pixels transformed by the module-level ``scaler``, so the
    drawn image goes through the same ``scaler.transform`` before prediction; feeding
    raw 0-255 values would not match the model's training data.
    If ``model`` is swapped for one trained on unscaled pixels, drop the scaling step.
    """
    # The drawing is black on white; invert it so the strokes carry the high pixel values
    op2 = ImageChops.invert(output_image)
    # Bicubic down-sampling (the original magic number 3). The constant is reached through
    # PIL.Image because the bare name `Image` is shadowed by `from tkinter import *`.
    op2 = op2.resize((28, 28), resample=PIL.Image.BICUBIC)
    # Show the image
    plt.imshow(op2, cmap='Greys')
    # Flatten to a single sample: 1 row x 784 pixel columns
    op2_array = np.asarray(op2).reshape(1, -1)
    # When the scaler was fitted on a DataFrame it remembers the column names; passing
    # the same names avoids scikit-learn's feature-name mismatch warning.
    feature_names = getattr(scaler, "feature_names_in_", None)
    if feature_names is not None:
        op2_array = pd.DataFrame(op2_array, columns=feature_names)
    print(model.predict(scaler.transform(op2_array)))
    
def clear():
    """Reset the drawing: close the current matplotlib figure, empty the Tk canvas and
    paint the backing PIL image with value 255 again."""
    plt.close()
    canvas.delete("all")
    # Rectangle over the whole backing image, filled with 255
    whole_image = (0, 0, width, height)
    draw.rectangle(whole_image, fill=255)
    
def paint(event):
    """Mouse-drag handler: draw a thick black mark at the pointer position on both the
    visible Tk canvas and the backing PIL image."""
    px, py = event.x, event.y
    # Small box around the pointer: (x - 1, y - 1) to (x + 1, y + 1)
    box = (px - 1, py - 1, px + 1, py + 1)
    canvas.create_oval(*box, fill="black", width=25)
    draw.line(list(box), fill="black", width=25)

master = Tk()

# Tkinter canvas to draw on. It is packed once, with the final options
# (the original packed it twice; the second call simply overrode the first).
canvas = Canvas(master, width=width, height=height, bg='white')
canvas.pack(expand=YES, fill=BOTH)
# Dragging with the left mouse button held down paints on the canvas
canvas.bind("<B1-Motion>", paint)

# Empty greyscale PIL image, plus a draw object, that paint() keeps in step with the canvas.
# PIL.Image is spelled out because the bare name `Image` is shadowed by `from tkinter import *`.
output_image = PIL.Image.new("L", (width, height), white)
draw = ImageDraw.Draw(output_image)

# Buttons to run the prediction and to clear the drawing. Each widget gets its own
# name and an explicit parent window instead of relying on Tk's default root.
predict_button = Button(master, text="predict", command=predict)
predict_button.pack()
clear_button = Button(master, text="clear", command=clear)
clear_button.pack()

# Blocks until the window is closed
master.mainloop()

In [None]:
from PIL import ImageChops