In [1]:
# Classifying Digits using Simple ML models
# Necessary Imports
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import gc
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

# Exploring files in the Input Directory
import os
# Recursively list every file under the Kaggle input directory, one full path per line.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for path in (os.path.join(root, fname) for fname in files):
        print(path)

# Step 1: Load the training data into NumPy arrays
# read_csv parses the training CSV into a DataFrame
train = pd.read_csv('/kaggle/input/digit-recognizer/train.csv')

# The `label` column is the target; every column after the first one is a feature
label = train['label'].values
features = train.iloc[:, 1:].values

# Sanity check: container type of each array, then the shape of each array
print(type(features), type(label), sep='\n')
print(features.shape, label.shape, sep='\n')


/kaggle/input/digit-recognizer/test.csv
/kaggle/input/digit-recognizer/train.csv
/kaggle/input/digit-recognizer/sample_submission.csv
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(42000, 784)
(42000,)


**Model 1:**

Logistic Regression

In [2]:
import warnings

# NOTE: this silences every warning for the rest of the session, including any
# convergence warnings the lbfgs solver may emit while fitting below.
warnings.simplefilter("ignore")

n_splits = 5
# KFold only consults random_state when shuffle=True. Passing it together with
# the default shuffle=False is ignored by older scikit-learn releases and raises
# a ValueError on newer ones, so it is dropped here; the folds stay the same
# contiguous, unshuffled splits.
kf = KFold(n_splits=n_splits)

# Best single-fold validation accuracy seen so far and the model that achieved it.
# (A module-level `global` statement is a no-op, so initialise the name explicitly.)
acc = 0.0
best_model = None

for jj, (train_index, val_index) in enumerate(kf.split(features)):
    print("Fitting fold", jj+1)
    train_features = features[train_index]
    train_target = label[train_index]

    val_features = features[val_index]
    val_target = label[val_index]

    model = LogisticRegression(C=20, solver='lbfgs', multi_class='multinomial')
    model.fit(train_features, train_target)

    # The argmax column index is compared directly with the targets; this assumes
    # the labels are the integers 0..n_classes-1 (the digits, for this dataset).
    val_pred = model.predict_proba(val_features)
    fold_acc = accuracy_score(val_target, np.argmax(val_pred, axis=1))
    print("Fold accuracy:", fold_acc)

    # Remember the model from the best-scoring fold.
    if fold_acc > acc:
        acc = fold_acc
        best_model = model

    # Release the per-fold copies before the next iteration allocates new ones.
    del train_features, train_target, val_features, val_target
    gc.collect()

    
    

Fitting fold 1
Fold accuracy: 0.9176190476190477
Fitting fold 2
Fold accuracy: 0.9191666666666667
Fitting fold 3
Fold accuracy: 0.911547619047619
Fitting fold 4
Fold accuracy: 0.9133333333333333
Fitting fold 5
Fold accuracy: 0.9157142857142857


In [3]:
# Best single-fold validation accuracy recorded by the cross-validation loop
print(acc)

0.9191666666666667


In [4]:
test = pd.read_csv('/kaggle/input/digit-recognizer/test.csv')
print(test.columns)
# The test file has no label column (only pixel columns are printed above),
# so every column is a feature.
test_features = test.values
# Predict with the best-scoring fold model tracked by the cross-validation loop;
# `model` would merely be whichever fold happened to be fitted last.
test_pred = best_model.predict_proba(test_features)
# The column index of the highest probability is used as the predicted label.
predict = np.argmax(test_pred, axis=1)

Index(['pixel0', 'pixel1', 'pixel2', 'pixel3', 'pixel4', 'pixel5', 'pixel6',
       'pixel7', 'pixel8', 'pixel9',
       ...
       'pixel774', 'pixel775', 'pixel776', 'pixel777', 'pixel778', 'pixel779',
       'pixel780', 'pixel781', 'pixel782', 'pixel783'],
      dtype='object', length=784)


In [5]:
# Start from the sample submission so its column layout is kept, then replace the
# Label column with the predictions. The absolute input path matches the other
# reads in this notebook instead of depending on the current working directory.
submission = pd.read_csv('/kaggle/input/digit-recognizer/sample_submission.csv')
submission['Label'] = predict
submission.to_csv('submission.csv', index=False)

**Model 2:**

Logistic Regression with Stochastic Gradient Descent

In [None]:
from sklearn.linear_model import SGDClassifier

n_splits = 5
# KFold only consults random_state when shuffle=True. Passing it together with
# the default shuffle=False is ignored by older scikit-learn releases and raises
# a ValueError on newer ones, so it is dropped here; the folds stay the same
# contiguous, unshuffled splits.
kf = KFold(n_splits=n_splits)

# Best single-fold validation accuracy seen so far and the model that achieved it.
# (A module-level `global` statement is a no-op, so initialise the name explicitly.)
acc = 0.0
best_model = None

for jj, (train_index, val_index) in enumerate(kf.split(features)):
    print("Fitting fold", jj+1)
    train_features = features[train_index]
    train_target = label[train_index]

    val_features = features[val_index]
    val_target = label[val_index]

    # loss='hinge' would train a linear SVM; loss='log' trains logistic regression.
    # (Newer scikit-learn releases spell this loss 'log_loss'.)
    # random_state fixes the SGD data shuffling so fold scores are reproducible.
    # NOTE: the raw feature values are passed in unscaled; SGD is sensitive to
    # feature scaling, which may limit the accuracy reached here.
    model = SGDClassifier(loss='log', random_state=137)
    model.fit(train_features, train_target)

    # The argmax column index is compared directly with the targets; this assumes
    # the labels are the integers 0..n_classes-1 (the digits, for this dataset).
    val_pred = model.predict_proba(val_features)
    fold_acc = accuracy_score(val_target, np.argmax(val_pred, axis=1))
    print("Fold accuracy:", fold_acc)

    # Remember the model from the best-scoring fold.
    if fold_acc > acc:
        acc = fold_acc
        best_model = model

    # Release the per-fold copies before the next iteration allocates new ones.
    del train_features, train_target, val_features, val_target
    gc.collect()


Fitting fold 1
Fold accuracy: 0.809047619047619
Fitting fold 2
Fold accuracy: 0.7825
Fitting fold 3


**Model 3:**
RandomForest Classifier

In [8]:
from sklearn.ensemble import RandomForestClassifier

n_splits = 5
# KFold only consults random_state when shuffle=True. Passing it together with
# the default shuffle=False is ignored by older scikit-learn releases and raises
# a ValueError on newer ones, so it is dropped here; the folds stay the same
# contiguous, unshuffled splits.
kf = KFold(n_splits=n_splits)

# Best single-fold validation accuracy seen so far and the model that achieved it.
# (A module-level `global` statement is a no-op, so initialise the name explicitly.)
acc = 0.0
best_model = None

for jj, (train_index, val_index) in enumerate(kf.split(features)):
    print("Fitting fold", jj+1)
    train_features = features[train_index]
    train_target = label[train_index]

    val_features = features[val_index]
    val_target = label[val_index]

    # max depth serves as an important hyperparameter
    # When the depth was set with a value of 2, accuracy was down by 60%
    model = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=0)
    model.fit(train_features, train_target)

    # The argmax column index is compared directly with the targets; this assumes
    # the labels are the integers 0..n_classes-1 (the digits, for this dataset).
    val_pred = model.predict_proba(val_features)
    fold_acc = accuracy_score(val_target, np.argmax(val_pred, axis=1))
    print("Fold accuracy:", fold_acc)

    # Remember the model from the best-scoring fold.
    if fold_acc > acc:
        acc = fold_acc
        best_model = model

    # Release the per-fold copies before the next iteration allocates new ones.
    del train_features, train_target, val_features, val_target
    gc.collect()


Fitting fold 1
Fold accuracy: 0.9432142857142857
Fitting fold 2
Fold accuracy: 0.9407142857142857
Fitting fold 3
Fold accuracy: 0.9402380952380952
Fitting fold 4
Fold accuracy: 0.9470238095238095
Fitting fold 5
Fold accuracy: 0.9484523809523809


In [9]:
test = pd.read_csv('/kaggle/input/digit-recognizer/test.csv')
print(test.columns)
# The test file has no label column (only pixel columns are printed above),
# so every column is a feature.
test_features = test.values
# Predict with the best-scoring fold model tracked by the cross-validation loop;
# `model` would merely be whichever fold happened to be fitted last.
test_pred = best_model.predict_proba(test_features)
# The column index of the highest probability is used as the predicted label.
predict = np.argmax(test_pred, axis=1)

Index(['pixel0', 'pixel1', 'pixel2', 'pixel3', 'pixel4', 'pixel5', 'pixel6',
       'pixel7', 'pixel8', 'pixel9',
       ...
       'pixel774', 'pixel775', 'pixel776', 'pixel777', 'pixel778', 'pixel779',
       'pixel780', 'pixel781', 'pixel782', 'pixel783'],
      dtype='object', length=784)


In [10]:
# Start from the sample submission so its column layout is kept, then replace the
# Label column with the predictions. The absolute input path matches the other
# reads in this notebook instead of depending on the current working directory.
submission = pd.read_csv('/kaggle/input/digit-recognizer/sample_submission.csv')
submission['Label'] = predict
# NOTE: this writes to the same 'submission.csv' as the earlier submission cell,
# so the file ends up holding the predictions from whichever cell ran last.
submission.to_csv('submission.csv', index=False)

**Reference:**
https://www.kaggle.com/tunguz/mnist-logistic-regression-baseline