In [0]:
import pandas as pd
import numpy as np

In [0]:
from sklearn.datasets import fetch_openml
# as_frame=False makes fetch_openml return NumPy arrays; the positional
# indexing (x[0]) and .reshape calls in later cells rely on that.
dataset = fetch_openml('mnist_784', version=1, as_frame=False)
dataset.keys()

In [0]:
# Pull the feature matrix and the label vector out of the bunch and
# report the shape of each.
x = dataset['data']
y = dataset['target']
for arr in (x, y):
    print(arr.shape)

In [0]:
import matplotlib as mpl
import matplotlib.pyplot as plt

# Take the first sample and view its flat pixel vector as a 28x28 image.
digit = x[0]
digit_image = digit.reshape((28, 28))

plt.imshow(
    digit_image,
    interpolation='nearest',
    cmap=mpl.cm.binary,
)
plt.axis('off')
plt.show()

In [0]:
# Label that goes with the first sample, x[0].
y[0]

In [0]:
# Cast the labels to unsigned 8-bit integers so they can be compared with
# numeric digits (e.g. y_train == 5) in the cells that follow.
y = y.astype(np.uint8)

In [0]:
# The first 60000 samples are used for training; everything after that is
# held out as the test set.
split = 60000
x_train, x_test = x[:split], x[split:]
y_train, y_test = y[:split], y[split:]

In [0]:
# Boolean targets for the binary "is this digit a 5?" task.
y_train_5, y_test_5 = (y_train == 5), (y_test == 5)

In [0]:
from sklearn.linear_model import SGDClassifier
# Train a linear classifier with SGD on the 5-vs-rest target.
# random_state is pinned so the fit (and every score derived from it in
# later cells) is reproducible after Restart & Run All.
sgd_clf = SGDClassifier(random_state=42)
sgd_clf.fit(x_train, y_train_5)

In [0]:
# Ask the fitted model whether `digit` (the first sample) is a 5; the
# sample is wrapped in a list to form a one-row batch.
sgd_clf.predict([digit])

In [0]:
from sklearn.model_selection import cross_val_score
# 5-fold cross-validated accuracy of the SGD model on the 5-vs-rest target.
scores = cross_val_score(
    sgd_clf,
    x_train,
    y_train_5,
    scoring='accuracy',
    cv=5,
)

In [0]:
# Show the accuracy obtained on each fold.
scores

In [0]:
from sklearn.base import BaseEstimator

class Never5Classifier(BaseEstimator):
    """Baseline estimator that always predicts "not 5".

    It learns nothing; it exists so its cross-validated accuracy can be
    compared with the accuracy of a real model on the 5-vs-rest target.
    """

    def fit(self, x, y=None):
        """Do nothing and return self, as the scikit-learn estimator API expects."""
        return self

    def predict(self, x):
        """Return a 1-D boolean array of False with one entry per row of x."""
        # One prediction per sample, shaped (n_samples,) like the labels,
        # rather than an (n_samples, 1) column vector.
        return np.zeros(len(x), dtype=bool)
    
# Cross-validated accuracy (3 folds) of the always-negative baseline.
b_clf = Never5Classifier()
cross_val_score(
    b_clf,
    x_train,
    y_train_5,
    scoring='accuracy',
    cv=3,
)

In [0]:
from sklearn.model_selection import cross_val_predict

# Out-of-fold predictions for every training sample, using 3 folds.
y_train_pred = cross_val_predict(
    sgd_clf,
    x_train,
    y_train_5,
    cv=3,
)

In [0]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the true 5-vs-rest labels against the
# cross-validated predictions.
confusion_matrix(y_train_5,y_train_pred)

In [0]:
# Reference case: a "perfect" classifier whose predictions are the labels
# themselves.
y_train_perf = y_train_5
confusion_matrix(y_true=y_train_5, y_pred=y_train_perf)

In [0]:
from sklearn.metrics import precision_score, recall_score

# Precision of the cross-validated predictions on the 5-vs-rest target.
precision_score(y_train_5,y_train_pred)


In [0]:
# Recall of the cross-validated predictions on the 5-vs-rest target.
recall_score(y_train_5,y_train_pred)

In [0]:
from sklearn.metrics import f1_score

# F1 score of the cross-validated predictions on the 5-vs-rest target.
f1_score(y_train_5,y_train_pred)

In [0]:
# Collect out-of-fold decision-function scores instead of class labels, so
# that a custom decision threshold can be applied afterwards.
y_scores = cross_val_predict(
    sgd_clf,
    x_train,
    y_train_5,
    method='decision_function',
    cv=3,
)

In [0]:
from sklearn.metrics import precision_recall_curve

# Precision and recall for the range of decision thresholds derived from
# the cross-validated scores.
precisions, recalls, thresholds = precision_recall_curve(y_train_5, y_scores)


In [0]:
def plot_precision_recall_vs_threshold(precisions,recalls,thresholds):
    """Plot precision and recall as functions of the decision threshold.

    The last element of `precisions` and `recalls` is dropped before
    plotting so that both line up with `thresholds`.
    """
    plt.plot(thresholds, precisions[:-1],'b--',label='Precision')
    plt.plot(thresholds, recalls[:-1], 'g-',label='Recall')
    # The labels above are only displayed once a legend is drawn; the x
    # axis is named so the figure can be read on its own.
    plt.xlabel('Threshold')
    plt.legend(loc='center right')

# Draw the precision and recall curves for the SGD decision scores.
plot_precision_recall_vs_threshold(precisions, recalls, thresholds)
plt.show()

In [0]:
# np.argmax on a boolean array gives the index of the first True, i.e. the
# first position where precision is at least 0.90.
# NOTE(review): the plotting helper slices precisions with [:-1] to match
# thresholds, so precisions appears to be one element longer; if the first
# match were that final element this lookup would raise IndexError.
threshold_90_precision = thresholds[np.argmax(precisions >=0.90)]

In [0]:
# Classify as positive every sample whose score reaches the threshold
# chosen for 90% precision.
y_train_pred_90 = (y_scores >= threshold_90_precision)

In [0]:
# Precision obtained with the custom threshold.
precision_score(y_train_5, y_train_pred_90)

In [0]:
# Recall obtained with the custom threshold.
recall_score(y_train_5, y_train_pred_90)

The ROC curve plots the true positive rate (recall) against the false positive rate for every possible decision threshold.

In [0]:
from sklearn.metrics import roc_curve

# The ROC thresholds get their own name so the precision/recall
# `thresholds` array computed earlier is not overwritten; re-running the
# earlier threshold cells would otherwise silently use the wrong array.
fpr, tpr, thresholds_roc = roc_curve(y_train_5, y_scores)

In [0]:
def plot_roc_curve(fpr, tpr, label=None):
    """Draw a ROC curve (true positive rate against false positive rate)."""
    plt.plot(fpr, tpr, label=label, linewidth=2)
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
# Plot the ROC curve computed from the SGD decision scores.
plot_roc_curve(fpr, tpr)
plt.show()

In [0]:
from sklearn.metrics import roc_auc_score
# Area under the ROC curve for the SGD decision scores.
roc_auc_score(y_train_5, y_scores)

In [0]:
from sklearn.ensemble import RandomForestClassifier

# Collect out-of-fold class probabilities from a random forest (3 folds).
forest_clf = RandomForestClassifier(random_state=42)
y_proba_forest = cross_val_predict(
    forest_clf,
    x_train,
    y_train_5,
    method='predict_proba',
    cv=3,
)

In [0]:
# Use the second probability column as the score fed to the ROC curve.
positive_col = 1
y_scores_forest = y_proba_forest[:, positive_col]
fpr_forest, tpr_forest, thresholds_forest = roc_curve(
    y_train_5,
    y_scores_forest,
)

In [0]:
# Overlay the ROC curves of both models.
# The legend text must be passed through the `label` keyword: a bare
# fourth positional argument is interpreted as plot data, not as a label.
plt.plot(fpr, tpr, 'b:', label='SGD')
plt.plot(fpr_forest, tpr_forest, 'b--', label='Random Forest')
plt.legend(loc='lower right')
plt.show()

In [0]:
# Area under the ROC curve for the random-forest scores.
roc_auc_score(y_train_5,y_scores_forest)

In [0]:
from sklearn.preprocessing import StandardScaler

# Standardise the training features; they are cast to float64 before the
# scaler is fitted and applied.
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(
    x_train.astype(np.float64)
)


In [0]:
# Multiclass evaluation on all ten digits. The standardised features
# (x_train_scaled) are used here; they were computed for this purpose but
# the unscaled x_train was being passed instead.
y_train_pred = cross_val_predict(sgd_clf, x_train_scaled, y_train, cv=3)
conf_mx = confusion_matrix(y_train, y_train_pred)
conf_mx

In [0]:
# Render the confusion matrix as a grayscale image.
plt.matshow(conf_mx,cmap=plt.cm.gray)
plt.show()

Q1. 

In [0]:
from sklearn.neighbors import KNeighborsClassifier

# k-NN with default hyper-parameters: fit on the training set (fit returns
# the estimator itself) and predict the held-out test set.
knn_clf = KNeighborsClassifier().fit(x_train, y_train)
y_pred = knn_clf.predict(x_test)

In [0]:
# Weighted-average F1 of the default k-NN on the test set.
f1_score(y_test, y_pred, average='weighted')

In [0]:
# Hyper-parameter grid for the k-NN search: both weighting schemes and
# neighbour counts 1 through 7.
params = {
    'weights': ('uniform', 'distance'),
    'n_neighbors': tuple(range(1, 8)),
}

In [0]:
# Fresh, unfitted k-NN estimator with default settings, to be handed to
# the grid search.
knn_clf = KNeighborsClassifier()

In [0]:
from sklearn.model_selection import GridSearchCV
# Search over `params` with 3-fold cross-validation, ranking candidates by
# accuracy.
grid_search = GridSearchCV(
    estimator=knn_clf,
    param_grid=params,
    scoring='accuracy',
    cv=3,
)
grid_search.fit(x_train, y_train)



In [0]:
# Hyper-parameter combination selected by the grid search.
grid_search.best_params_

In [0]:
# Keep the grid search's best estimator as the final model.
final_model = grid_search.best_estimator_

In [0]:
# Predict the held-out test set with the tuned model.
y_pred = final_model.predict(x_test)

In [0]:
# The target has ten classes, so an averaging mode is required: the
# default average='binary' raises ValueError on multiclass labels.
# 'weighted' matches the score computed for the default k-NN model.
f1_score(y_test, y_pred, average='weighted')

Q2. 


In [0]:
# `shift` is imported from scipy.ndimage directly; the
# scipy.ndimage.interpolation namespace is deprecated.
from scipy.ndimage import shift

def shift_image(image, dx, dy):
  """Return a flattened 28x28 image translated by (dx, dy) pixels.

  Parameters:
    image: array-like of 784 pixel values (a flattened 28x28 image).
    dx: horizontal shift in pixels (positive moves content to the right).
    dy: vertical shift in pixels (positive moves content down).

  Returns:
    A 1-D array of 784 values; pixels shifted in from outside the image
    are filled with 0.
  """
  image = image.reshape((28,28))
  # scipy takes one shift per array axis: rows (vertical) first, then
  # columns (horizontal), hence [dy, dx].
  shifted_image = shift(image, [dy, dx], cval=0, mode='constant')
  # Flatten back to a vector so it can be stacked with the original samples.
  return shifted_image.reshape([-1])


    

In [0]:
# Begin with the original samples, then for each of the four one-pixel
# directions append a shifted copy of every training image together with
# its (unchanged) label.
x_train_augmented = list(x_train)
y_train_augmented = list(y_train)

shifts = ((1,0),(0,1),(-1,0),(0,-1))
for dx, dy in shifts:
  x_train_augmented.extend(shift_image(img, dx, dy) for img in x_train)
  y_train_augmented.extend(y_train)

# Convert the accumulated lists back to arrays.
x_train_augmented = np.array(x_train_augmented)
y_train_augmented = np.array(y_train_augmented)

In [0]:
# Shuffle the augmented set, applying one shared permutation to samples
# and labels so they stay aligned.
shuffle_id = np.random.permutation(len(x_train_augmented))
x_train_augmented, y_train_augmented = (
    x_train_augmented[shuffle_id],
    y_train_augmented[shuffle_id],
)

In [0]:
# Build a new k-NN with the hyper-parameters selected by the grid search.
# (The class name is KNeighborsClassifier; the previous spelling raised
# NameError.)
knn_clf = KNeighborsClassifier(**grid_search.best_params_)

In [0]:
# Train the tuned k-NN on the augmented, shuffled training set.
knn_clf.fit(x_train_augmented, y_train_augmented)


In [0]:
# Predict the held-out test set with the model trained on augmented data.
y_pred = knn_clf.predict(x_test)

In [0]:
# The target has ten classes, so an averaging mode is required: the
# default average='binary' raises ValueError on multiclass labels.
# 'weighted' matches the score computed for the default k-NN model.
f1_score(y_test, y_pred, average='weighted')