# MNIST Classifier

In this notebook you will create both an MNIST tabular dataset and a classifier.

## 1.- Import the Operating System (os) module in Python and any other libraries you need

In [167]:
import os
from PIL import Image,ImageOps
import numpy as np
import pandas as pd
from IPython.display import display

## 2.- As you can see, each class has its own folder (do this only for the training set).

    - Iterate folder by folder ( os.listdir() )
    - Inside each folder: 
        1.- Read the image
        2.- Reshape it into a flat array (784,)
        3.- Save the data into a pandas DataFrame, appending the class as an extra column
    - Save the data into a CSV

    Note: if it takes too long, try using only 100 images per folder for the CSV.

In [160]:
# Build the tabular dataset: one row per training image, made of the flattened
# pixel values followed by the class label, then persist it as data.csv.
path = "data/trainingSet/trainingSet/"

# Keep only the digit-named class folders. os.listdir() returns entries in
# arbitrary order, so slicing off the "first" one (the original [1:], presumably
# meant to skip a hidden file) could just as well drop a real class.
# NOTE(review): assumes class folders are named "0".."9" -- confirm.
tr_directory = sorted(
    entry for entry in os.listdir(path)
    if entry.isdigit() and os.path.isdir(os.path.join(path, entry))
)

df = []
for class_name in tr_directory:
    class_dir = os.path.join(path, class_name)
    # Sorted so the row order (and therefore any seeded split) is repeatable.
    for filename in sorted(os.listdir(class_dir)):
        # The context manager closes the file handle once the pixels are read.
        with Image.open(os.path.join(class_dir, filename)) as img:
            arr = np.array(img).astype(float).flatten()
        # The label goes last so it ends up as the final column.
        df.append(np.append(arr, float(class_name)))

images = pd.DataFrame(df)
# index=False: otherwise the row index is written as an extra column, which is
# read back as "Unnamed: 0" and mistaken for a pixel feature.
images.to_csv('data.csv', index=False)


## 3.- Load the CSV

In [161]:
images = pd.read_csv('data.csv')
# If data.csv was written together with the DataFrame index, pandas reads that
# index back as a leading "Unnamed: 0" column. Left in place it would be used as
# a feature and make each row one value too long for a 28x28 reshape.
# Drop it when present; this is a no-op when the file has no such column.
images = images.loc[:, ~images.columns.str.startswith('Unnamed')]


In [155]:
# Features are every column except the last; the target is the last column,
# which is where the class label was appended when the rows were built.
X = images.iloc[:, :-1]
y = images.iloc[:, -1]


(42000,)

In [170]:
imager = X.iloc[4,:].values
imager = imager.reshape((28,28))
icon = Image.fromarray(imager)
%matplotlib inline
from matplotlib.pyplot import imshow


## 4.- Create a dictionary of models (No preprocessing needed, it has already been done).
    
    Include both tree models and mult models.

In [79]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split,GridSearchCV
# Hold out 20% of the rows for testing. Stratifying on y and fixing the seed
# are both passed explicitly so the split can be repeated.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=0,
)


## 5.- Using either cross-validation or stratification, find out which model is the best
    - Base your code on the previous two days' examples

In [80]:
# Candidate classifiers, each paired with the hyperparameter grid to search.
# The dict keeps its original name `trees` although it also holds SVC and KNN.
# random_state is fixed on the estimators that accept it here, so repeated runs
# give the same scores (same seed value as the train/test split).
trees = {
    'Decision tree': {
        'model': DecisionTreeClassifier(random_state=0),
        'params': {
            'max_depth': [None, 10]
        }
    },
    'Random Forest': {
        'model': RandomForestClassifier(random_state=0),
        'params': {
            'n_estimators': [100, 200]
        }
    },
    'Ada Boost': {
        'model': AdaBoostClassifier(random_state=0),
        'params': {
            'n_estimators': [50, 100]
        }
    },
    'SVC': {
        'model': SVC(),
        'params': {
            'C': [1, 10],
            'kernel': ['rbf', 'poly']
        }
    },
    'KNN': {
        'model': KNeighborsClassifier(),
        'params': {
            'n_neighbors': [5, 10],
            'p': [1, 2]
        }
    }
}

# Run a 3-fold grid search per model on the training split and record the best
# cross-validated score together with the parameters that produced it.
results = []
for name, spec in trees.items():
    # n_jobs=-1 fits the parameter/fold combinations in parallel on all cores;
    # it affects run time only, not the scores.
    grid = GridSearchCV(spec['model'], spec['params'], cv=3, n_jobs=-1)
    grid.fit(X_train, y_train)
    results.append({
        'model': name,
        'best score': grid.best_score_,
        'best parameters': grid.best_params_
    })
    print(f'hyperparameter tuning for {name} has been completed')


hyperparameter tuning for Decision tree has been completed
hyperparameter tuning for Random Forest has been completed
hyperparameter tuning for Ada Boost has been completed
hyperparameter tuning for SVC has been completed
hyperparameter tuning for KNN has been completed


In [81]:
# One row per model: its name, best cross-validated score and best parameters.
resultss = pd.DataFrame(data=results)
resultss

Unnamed: 0,model,best score,best parameters
0,Decision tree,0.822827,{'max_depth': 10}
1,Random Forest,0.957411,{'n_estimators': 200}
2,Ada Boost,0.72125,{'n_estimators': 100}
3,SVC,0.976935,"{'C': 10, 'kernel': 'rbf'}"
4,KNN,0.961369,"{'n_neighbors': 5, 'p': 2}"


## Optional: Can you rotate an image?

In [169]:
# Load one test image and render three transformed copies inline.
image = Image.open('data/testSample/img_5.jpg')

# display() each result explicitly: a cell only renders its last expression on
# its own (so the flipped image was silently discarded), and Image.show() opens
# an external viewer instead of drawing in the notebook.
display(
    image.rotate(45),        # rotated 45 degrees counter-clockwise
    ImageOps.flip(image),    # flipped top to bottom
    ImageOps.mirror(image),  # flipped left to right
)