In [None]:
def reverse_digits(n):
    """Return the decimal digits of a positive integer in reverse order.

    Uses integer arithmetic only (``divmod``), so the result is exact for
    arbitrarily large integers. Non-positive input yields 0, because the loop
    body never runs. Trailing zeros of ``n`` are dropped in the result
    (e.g. 1200 -> 21).
    """
    n = int(n)
    rev = 0
    while n > 0:
        # Peel off the last digit and append it to the reversed number.
        n, digit = divmod(n, 10)
        rev = rev * 10 + digit
    return rev


i = 1234
rev = reverse_digits(i)
print(rev)

In [None]:
import numpy as np
import pandas as pd
import os
import matplotlib
import matplotlib.pyplot as plot
%matplotlib inline
from sklearn.datasets import fetch_openml
# Fetch version 1 of the "mnist_784" dataset from OpenML as NumPy arrays
# (as_frame=False) rather than a pandas DataFrame.
mnist = fetch_openml(name="mnist_784", version=1, as_frame=False)
mnist.keys()

In [None]:
# Separate the feature matrix from the target labels, then show the matrix dimensions.
X = mnist["data"]
y = mnist["target"]
X.shape

In [None]:
# Show the dimensions of the label array y.
y.shape

In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt
# Take the first sample and view its flat feature vector as a 28x28 image.
some_digit = X[0]
some_digit_image = np.reshape(some_digit, (28, 28))

# Draw it with the "binary" colormap and hide the axes.
plt.imshow(some_digit_image, cmap="binary", interpolation="nearest")
plt.axis("off")
plt.show()

In [None]:
# Label stored for the first sample (the same index as some_digit = X[0]).
y[0]

In [None]:
# Cast the labels to unsigned 8-bit integers so they can be compared with numbers.
y = y.astype("uint8")

In [None]:
# The first 60,000 samples form the training set; everything after them is the test set.
X_train, y_train = X[:60000], y[:60000]
X_test, y_test = X[60000:], y[60000:]

## Training a Binary Classifier
This binary classifier detects a single digit. Let's take '5' as the example; it works as follows:

- True if '5'

- False if 'not 5'

In [None]:
# Boolean targets for the "5 vs. not 5" task: True where the digit is 5, False otherwise.
y_test_5 = y_test == 5
y_train_5 = y_train == 5

In [None]:
from sklearn.linear_model import SGDClassifier

# Fixed random_state so repeated runs give the same model.
sgd_clf = SGDClassifier(random_state=42)
sgd_clf.fit(X=X_train, y=y_train_5)

In [None]:
# Predict on a single sample, passed as a one-row 2-D array.
sgd_clf.predict(some_digit.reshape(1, -1))

#### Taking some performance measures
Here we will be using cross-validation.

Here we use 3 folds, which means that each time we train the model on 2 of the folds and test it on the remaining one.

In [None]:
from sklearn.model_selection import cross_val_score

# Accuracy of the SGD classifier on each of the 3 cross-validation folds.
cross_val_score(estimator=sgd_clf, X=X_train, y=y_train_5, scoring="accuracy", cv=3)

In [None]:
from sklearn.base import BaseEstimator
class Never5Classifier(BaseEstimator):
    """Baseline estimator that predicts "not 5" (False) for every sample.

    It learns nothing from the data and exists only to be compared against a
    real classifier.
    """

    def fit(self, X, y=None):
        """Do nothing and return ``self``, as scikit-learn estimators are expected to."""
        return self

    def predict(self, X):
        """Return an all-False boolean array of shape ``(len(X), 1)``."""
        return np.zeros((len(X), 1), dtype=bool)

In [None]:
# Score the always-"not 5" baseline with the same 3-fold accuracy protocol.
never_5_clf = Never5Classifier()
cross_val_score(estimator=never_5_clf, X=X_train, y=y_train_5, scoring="accuracy", cv=3)

It has over 90% accuracy! This is simply because only about 10% of the
images are 5s, so if you always guess that an image is not a 5, you will be right about
90% of the time.
This demonstrates why accuracy is generally not the preferred performance measure
for classifiers, especially when you are dealing with skewed datasets (i.e., when some
classes are much more frequent than others).




A better way to evaluate the performance of a classifier is to look at the confusion matrix. The general idea is to count the number of times instances of class 'A' were classified as class 'B'.

For example, how many times were images of the digit 5 confused with images of the digit 3?

So, for this first we need to make some predictions.

In [None]:
from sklearn.model_selection import cross_val_predict

# cross_val_predict() performs K-fold cross-validation just like cross_val_score(),
# but it returns the predictions made on each test fold instead of the evaluation scores.
y_train_pred = cross_val_predict(estimator=sgd_clf, X=X_train, y=y_train_5, cv=3)

In [None]:
from sklearn.metrics import confusion_matrix

# Compare the true "is 5" labels with the cross-validated predictions.
confusion_matrix(y_true=y_train_5, y_pred=y_train_pred)

Each row in a confusion matrix represents an actual class and each column represents a predicted class.
A perfect classifier would have only true positives and true negatives, so its confusion matrix would have nonzero values only on its main diagonal.

In [None]:
# Use the true labels as the "predictions" to show the confusion matrix of a perfect classifier.
y_train_perfect_predictions = y_train_5
confusion_matrix(y_true=y_train_5, y_pred=y_train_perfect_predictions)

Precision and Recall:

The confusion matrix gives you a lot of information, but sometimes you may prefer a more concise metric. An interesting one to look at is the accuracy of the positive predictions, also called 'PRECISION'.

$$\text{precision} = \frac{TP}{TP + FP}$$

A trivial way to have perfect precision is to make one single positive prediction and ensure it is correct (precision = 1/1 = 100%). This would not be very useful since the classifier would ignore all but one positive instance. So precision is typically used along with another metric named recall, also called sensitivity or true positive rate.

$$\text{recall} = \frac{TP}{TP + FN}$$

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Train a RandomForestClassifier so its results can be compared with the SGDClassifier.
# method="predict_proba" makes cross_val_predict return class probabilities.
forest_clf = RandomForestClassifier(random_state=42)
y_probas_forest = cross_val_predict(
    estimator=forest_clf, X=X_train, y=y_train_5, method="predict_proba", cv=3
)

In [None]:
# roc_curve was never imported in this notebook; import it here so the cell
# runs on a fresh kernel.
from sklearn.metrics import roc_curve

# To plot a ROC curve you need scores, not probabilities.
# A simple solution is to use the positive class's probability as the score:
y_scores_forest = y_probas_forest[:, 1]  # score = proba of positive class
fpr_forest, tpr_forest, thresholds_forest = roc_curve(y_train_5, y_scores_forest)

In [None]:
from sklearn.metrics import roc_curve


def plot_roc_curve(fpr, tpr, label=None):
    """Plot one ROC curve together with the diagonal of a purely random classifier.

    Args:
        fpr: False positive rates, as returned by ``roc_curve``.
        tpr: True positive rates, as returned by ``roc_curve``.
        label: Optional legend label for the curve.
    """
    plt.plot(fpr, tpr, linewidth=2, label=label)
    plt.plot([0, 1], [0, 1], "k--")  # chance level
    plt.axis([0, 1, 0, 1])
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate (Recall)")


# SGDClassifier has no predict_proba by default, so its out-of-fold decision
# scores are used to build its ROC curve.
y_scores = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3, method="decision_function")
fpr, tpr, thresholds = roc_curve(y_train_5, y_scores)

plt.plot(fpr, tpr, "b:", label="SGD")
plot_roc_curve(fpr_forest, tpr_forest, "Random Forest")
plt.legend(loc="lower right")

In [None]:
from sklearn.metrics import roc_auc_score

# Area under the ROC curve for the SGD classifier.
# The out-of-fold decision scores are computed in this cell so that it runs on a
# fresh kernel; with random_state=42 on sgd_clf the result is reproducible.
y_scores = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3, method="decision_function")
roc_auc_score(y_train_5, y_scores)

In [None]:
from sklearn.metrics import roc_auc_score

# Area under the ROC curve for the random forest classifier.
roc_auc_score(y_train_5, y_scores_forest)

From the above scores it can easily be concluded that the random forest is a much better classifier than the SGD classifier for this binary classification task.

## Multiclass Classification

Multiclass Classifiers also known as multinomial classifiers can distinguish between more than two classes.

There are two strategies followed to solve multiclass classification problems:
1. One way to create a system that can classify the digit images into 10
classes (from 0 to 9) is to train 10 binary classifiers, one for each digit (a 0-detector, a
1-detector, a 2-detector, and so on). Then when you want to classify an image, you get
the decision score from each classifier for that image and you select the class whose
classifier outputs the highest score. This is called the one-versus-all (OvA) strategy
(also called one-versus-the-rest).

2. Another strategy is to train a binary classifier for every pair of digits: one to distinguish
0s and 1s, another to distinguish 0s and 2s, another for 1s and 2s, and so on.
This is called the one-versus-one (OvO) strategy. If there are N classes, you need to
train N × (N – 1) / 2 classifiers. For the MNIST problem, this means training 45
binary classifiers! When you want to classify an image, you have to run the image
through all 45 classifiers and see which class wins the most duels. The main advantage
of OvO is that each classifier only needs to be trained on the part of the training
set for the two classes that it must distinguish.

Some algorithms (such as Support Vector Machine classifiers) scale poorly with the
size of the training set, so for these algorithms OvO is preferred since it is faster to
train many classifiers on small training sets than training few classifiers on large
training sets. For most binary classification algorithms, however, OvA is preferred.

In [None]:
# Refit the same SGD classifier on the full multiclass labels, then classify the sample.
# fit() returns the estimator itself, so the two calls can be chained.
sgd_clf.fit(X_train, y_train).predict([some_digit])

In [None]:
# Decision scores of the SGD classifier for the single sample.
some_digit_scores = sgd_clf.decision_function(some_digit.reshape(1, -1))

In [None]:
# Display the decision scores computed for some_digit.
some_digit_scores

In [None]:
# Position of the highest decision score.
some_digit_scores.argmax()

In [None]:
# Class labels known to the fitted SGD classifier.
sgd_clf.classes_

In [None]:
# Class label stored at index 5 of classes_.
# NOTE(review): 5 is presumably the argmax index found above; confirm on re-run.
sgd_clf.classes_[5]

In [None]:
from sklearn.svm import SVC

# Train on the first 1000 samples only, using the multiclass labels
# (y_train, not y_train_5), then classify the sample.
svm_clf = SVC(gamma="auto", random_state=42)
svm_clf.fit(X_train[:1000], y_train[:1000]).predict([some_digit])

In [None]:
# Decision scores of the SVM classifier for the single sample (overwrites the SGD scores).
some_digit_scores = svm_clf.decision_function(some_digit.reshape(1, -1))
some_digit_scores

In [None]:
# Position of the highest SVM decision score.
some_digit_scores.argmax()

In [None]:
# Class labels known to the fitted SVM classifier.
svm_clf.classes_

In [None]:
# Class label stored at index 5 of classes_.
# NOTE(review): 5 is presumably the argmax index found above; confirm on re-run.
svm_clf.classes_[5]

In [None]:
# Refit the random forest on the multiclass labels and classify the sample.
# fit() returns the estimator itself, so the two calls can be chained.
forest_clf.fit(X_train, y_train).predict([some_digit])

In [None]:
# Per-class probabilities the random forest assigns to the sample.
forest_clf.predict_proba(some_digit.reshape(1, -1))

In [None]:
# 3-fold accuracy of the SGD classifier on the multiclass task (unscaled inputs).
cross_val_score(estimator=sgd_clf, X=X_train, y=y_train, scoring="accuracy", cv=3)

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize the features (cast to float64 first), then repeat the 3-fold evaluation.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.astype("float64"))
cross_val_score(estimator=sgd_clf, X=X_train_scaled, y=y_train, scoring="accuracy", cv=3)

**Cross-validation:**

Cross-validation allows us to compare different machine learning methods and see how well they perform in practice.

In [None]:
# Exercise: k-nearest-neighbours classifier with 4 neighbours, fitted on the scaled training set.
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=4)
knn.fit(X=X_train_scaled, y=y_train)

In [None]:
# 3-fold accuracy of the kNN classifier on the scaled training set.
cross_val_score(estimator=knn, X=X_train_scaled, y=y_train, scoring="accuracy", cv=3)

In [None]:
from sklearn.model_selection import GridSearchCV

# Search over both weighting schemes and three neighbourhood sizes with 5-fold CV.
param_grid = [{"weights": ["uniform", "distance"], "n_neighbors": [3, 4, 5]}]

knn = KNeighborsClassifier()
grid_search = GridSearchCV(estimator=knn, param_grid=param_grid, cv=5, verbose=3)
grid_search.fit(X_train, y_train)

In [None]:
# Parameter combination selected by the grid search.
grid_search.best_params_

In [None]:
# Cross-validated score of the selected parameter combination.
grid_search.best_score_

## MULTI LABEL CLASSIFIERS
Multi-label classification involves predicting zero or more class labels for each instance, unlike normal classification tasks, where class labels are mutually exclusive.

For example if you train a classifier to recognize three faces, Alice, Bob, and Charlie; then
when it is shown a picture of Alice and Charlie, it should output [1, 0, 1] (meaning
“Alice yes, Bob no, Charlie yes”)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Two boolean targets per image: "digit is 7 or larger" and "digit is odd".
y_train_odd = (y_train % 2 == 1)
y_train_large = (y_train >= 7)
y_multilabel = np.column_stack((y_train_large, y_train_odd))

knn_clf = KNeighborsClassifier()
knn_clf.fit(X_train, y_multilabel)

In [None]:
# Multilabel prediction for the sample: one value per target (large, odd).
knn_clf.predict(some_digit.reshape(1, -1))

In [None]:
# f1_score was never imported in this notebook; import it here so the cell
# runs on a fresh kernel.
from sklearn.metrics import f1_score

# Out-of-fold multilabel predictions, scored with the unweighted mean of the
# per-label F1 scores (average="macro").
y_train_knn_pred = cross_val_predict(knn_clf, X_train, y_multilabel, cv=3)
f1_score(y_multilabel, y_train_knn_pred, average="macro")

## Finally testing the model on Test Set

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Fit the final model on the training split only. Fitting on X_test would leak
# the held-out data into the model and make any test-set score meaningless.
knn = KNeighborsClassifier(n_neighbors=4, weights='distance')
knn.fit(X_train, y_train)

In [None]:
# Held-out evaluation: train on the training split, then measure accuracy on the
# untouched test split. Cross-validating inside the test set would instead train
# on test data and ignore the 60,000 training images.
knn.fit(X_train, y_train)
knn.score(X_test, y_test)

Recorded 3-fold cross-validation accuracies on the test split:

- normal output (default settings): array([0.91241752, 0.93939394, 0.95439544])
- weights = distance output: array([0.91841632, 0.9429943 , 0.96189619])
- weights = uniform output: array([0.91241752, 0.93939394, 0.95439544])

With weights = distance, the kNN algorithm averages about 94% accuracy across the three folds (about 96% on the best fold).