# MNIST Classifier

In this notebook you will create both, an mnist tabular dataset and a classifier.

## 1.- import the Operating System (os) module in python and any other library you need

In [77]:
import numpy    as np
from numpy.testing._private.utils import decorate_methods
import pandas   as pd
import seaborn  as sb
import matplotlib.pyplot as plt
import sklearn  as skl
import time

from sklearn import pipeline      # Pipeline
from sklearn import preprocessing # OrdinalEncoder, LabelEncoder
from sklearn import impute
from sklearn import compose
from sklearn import model_selection # train_test_split
from sklearn import metrics         # accuracy_score, balanced_accuracy_score, plot_confusion_matrix
from sklearn import set_config

from sklearn.tree          import DecisionTreeRegressor
from sklearn.ensemble      import RandomForestRegressor
from sklearn.ensemble      import ExtraTreesRegressor
from sklearn.ensemble      import AdaBoostRegressor
from sklearn.ensemble      import GradientBoostingRegressor
from xgboost               import XGBRegressor
from lightgbm              import LGBMRegressor
from catboost              import CatBoostRegressor

## 2.- As you can see each class has its own folder (Do it only for train). 

    - Iterate folder by folder ( os.listdir() )
    - Inside each folder: 
        1.- Read the image
        2.- Reshape it into a flat array (784,)
        3.- Save the data into a pandas dataframe apending the column name as the class
    - Save the data into a CSV

    Note: if it takes to long try doing only 100 images per folder for the CSV.

In [61]:

# Get the list of all files and directories
# in the root directory
path = "D:/Machine_Learning/Machine_Learning/12. Images/Data/trainingSet/trainingSet/"
dir_list = os.listdir(path)

df1 = pd.DataFrame()

for file in dir_list:
    imgs = os.listdir(path+file)
    arr = np.zeros((len(imgs),785))
    for i,img in enumerate(imgs):
        imag = Image.open(path+file+'/'+img)
        arry = np.array(imag,dtype=float)
        arry = arry.flatten()
        arr[i,:784]=arry
        arr[i,784]=int(file)
    df2 = pd.DataFrame(data=arr)
    df = pd.concat([df1,df2])
    df1 = df
    print(df.shape)
    #df[file] = arr

(4132, 785)
(8816, 785)
(12993, 785)
(17344, 785)
(21416, 785)
(25211, 785)
(29348, 785)
(33749, 785)
(37812, 785)
(42000, 785)


In [72]:
print(df.tail())

      0    1    2    3    4    5    6    7    8    9    ...  775  776  777  \
4183  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  1.0  1.0  ...  0.0  0.0  0.0   
4184  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  4.0  ...  0.0  0.0  0.0   
4185  0.0  3.0  6.0  1.0  0.0  2.0  2.0  0.0  0.0  7.0  ...  7.0  0.0  0.0   
4186  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
4187  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  1.0  ...  0.0  0.0  3.0   

      778  779  780  781  782  783  784  
4183  0.0  0.0  0.0  0.0  0.0  0.0  9.0  
4184  0.0  0.0  0.0  0.0  0.0  0.0  9.0  
4185  1.0  0.0  0.0  0.0  0.0  0.0  9.0  
4186  0.0  0.0  0.0  0.0  0.0  0.0  9.0  
4187  0.0  0.0  0.0  0.0  0.0  0.0  9.0  

[5 rows x 785 columns]


## 3.- Load the CSV

In [54]:
df.to_csv("MNIST_data.csv",index=False)

## 4.- Create a dictionary of models (No preprocessing needed, it has already been done).
    
    Include both, tree models and mult models.

In [79]:
#final_data = data
# df_new = pd.read_csv("MNIST_data.csv")
y = df.iloc[:,-1]
x = df.iloc[:,:-1]

In [81]:


x_train, x_test, y_train, y_test = model_selection.train_test_split(x, y,
                                    test_size=0.2,
                                    random_state=0  # Recommended for reproducibility
                                )

transformer = preprocessing.PowerTransformer()
y_train = transformer.fit_transform(y_train.values.reshape(-1,1))
y_test = transformer.transform(y_test.values.reshape(-1,1))

rang = abs(y_train.max()) + abs(y_train.min())

num_4_treeModels = pipeline.Pipeline(steps=[
    ('imputer', impute.SimpleImputer(strategy='constant', fill_value=-9999)),
])

cat_4_treeModels = pipeline.Pipeline(steps=[
    ('imputer', impute.SimpleImputer(strategy='constant', fill_value='missing')),
    ('ordinal', preprocessing.OrdinalEncoder()) # handle_unknown='ignore' ONLY IN VERSION 0.24
])

tree_classifiers = {
  "Decision Tree": DecisionTreeRegressor(),
  "Extra Trees":   ExtraTreesRegressor(n_estimators=100),
  "Random Forest": RandomForestRegressor(n_estimators=100),
  "AdaBoost":      AdaBoostRegressor(n_estimators=100),
  "Skl GBM":       GradientBoostingRegressor(n_estimators=100),
  "XGBoost":       XGBRegressor(n_estimators=100),
  "LightGBM":      LGBMRegressor(n_estimators=100),
  "CatBoost":      CatBoostRegressor(n_estimators=100),
}
### END SOLUTIONv

tree_classifiers = {name: pipeline.make_pipeline( model) for name, model in tree_classifiers.items()}

results = pd.DataFrame({'Model': [], 'MSE': [], 'MAB': [], " % error": [], 'Time': []})

for model_name, model in tree_classifiers.items():
    
    start_time = time.time()
    model.fit(x_train, y_train)
    total_time = time.time() - start_time
        
    pred = model.predict(x_test)
    
    results = results.append({"Model":    model_name,
                              "MSE": metrics.mean_squared_error(y_test, pred),
                              "MAB": metrics.mean_absolute_error(y_test, pred),
                              " % error": metrics.mean_squared_error(y_test, pred) / rang,
                              "Time":     total_time},
                              ignore_index=True)

  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  y = column_or_1d(y, warn=True)


KeyboardInterrupt: 

## 5.- Using either cross validation or stratification find out which is the best model
    - Base your code on the previous two days examples

## Optional: Can you rotate an image?