In [1]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,recall_score,f1_score,precision_score,roc_auc_score,log_loss,classification_report,confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from ydata_profiling import ProfileReport

In [2]:
# Download the OpenML "mnist_784" dataset (version 1) as arrays instead of a DataFrame.
mnist = fetch_openml('mnist_784', version=1, as_frame=False)

# Pull the pixel matrix and the label vector out of the returned object.
X = mnist["data"]
y = mnist["target"]

In [3]:
# Display the fetched dataset object to inspect what it contains.
# NOTE(review): this repr is very long (it lists every feature name); consider
# showing only mnist.keys() or the array shapes instead.
mnist

{'data': array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]]),
 'target': array(['5', '0', '4', ..., '4', '5', '6'], dtype=object),
 'frame': None,
 'categories': {'class': ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']},
 'feature_names': ['pixel1',
  'pixel2',
  'pixel3',
  'pixel4',
  'pixel5',
  'pixel6',
  'pixel7',
  'pixel8',
  'pixel9',
  'pixel10',
  'pixel11',
  'pixel12',
  'pixel13',
  'pixel14',
  'pixel15',
  'pixel16',
  'pixel17',
  'pixel18',
  'pixel19',
  'pixel20',
  'pixel21',
  'pixel22',
  'pixel23',
  'pixel24',
  'pixel25',
  'pixel26',
  'pixel27',
  'pixel28',
  'pixel29',
  'pixel30',
  'pixel31',
  'pixel32',
  'pixel33',
  'pixel34',
  'pixel35',
  'pixel36',
  'pixel37',
  'pixel38',
  'pixel39',
  'pixel40',
  'pixel41',
  'pixel42',
  'pixel43',
  'pixel44',
  'pixel45',
  'pixel46',
  'pixel47

In [4]:
# Wrap the pixel matrix in a DataFrame (one column per pixel position).
df = pd.DataFrame(data=X)

In [5]:
# Attach the targets to the pixel frame as an extra "label" column.
df["label"] = y

In [6]:
# Separate the combined frame back into the label series and the feature frame.
y = df["label"]
X = df.drop(columns="label")

In [7]:
# Hold out 20% of the samples for testing, stratified on the label,
# with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [8]:
# Find which intensity value occurs most often across all training pixels.
# (numpy is imported as `np` in the notebook's top import cell.)

# View every training pixel as one long 1-D array. ravel() returns a view when
# the underlying data is contiguous, whereas flatten() always copies the whole
# training matrix.
all_pixels = X_train.to_numpy().ravel()

# Count occurrences of each distinct value and keep the one with the top count.
unique, counts = np.unique(all_pixels, return_counts=True)
most_common_value = unique[np.argmax(counts)]

print("Most common pixel value:", most_common_value)


Most common pixel value: 0


In [9]:
# Locate the pixel columns whose value is 0 in every training row.
is_all_zero = X_train.eq(0).all(axis=0)
zero_cols = X_train.columns[is_all_zero]

# Keep a plain-list copy for dropping these columns from both splits later.
empty_cols = zero_cols.tolist()

print("Columns that are always zero:", empty_cols)
print("Total always-zero columns:", len(empty_cols))

Columns that are always zero: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 52, 53, 54, 55, 56, 57, 82, 83, 84, 85, 111, 112, 113, 140, 168, 476, 560, 644, 671, 672, 673, 699, 700, 701, 727, 728, 729, 730, 754, 755, 756, 757, 758, 759, 780, 781, 782, 783]
Total always-zero columns: 66


In [10]:
# Remove the always-zero columns (determined on the training split) from both
# splits so train and test keep the same feature set.
X_train_new = X_train.drop(columns=empty_cols)
X_test_new = X_test.drop(columns=empty_cols)

In [11]:
# Column labels of the reduced training frame; used as the scaler's column selection.
cols = X_train_new.columns

In [12]:
# Standardise the columns listed in `cols`; any column not listed is passed
# through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), cols),
    ],
    remainder="passthrough",
)

In [13]:
# Fit the preprocessor on the training split only, then apply that same fitted
# transform to the test split.
#
# The transformer returns bare arrays, so the frames are rebuilt with the
# original column labels AND the original row index. Without `index=` the
# frames would get a fresh 0..n-1 index and no longer line up with
# y_train / y_test, which keep the shuffled index from the split.
X_train_transformed = pd.DataFrame(
    preprocessor.fit_transform(X_train_new),
    columns=X_train_new.columns,
    index=X_train_new.index,
)
X_test_transformed = pd.DataFrame(
    preprocessor.transform(X_test_new),
    columns=X_test_new.columns,
    index=X_test_new.index,
)

In [14]:
# Summary statistics of the scaled training features (sanity check of the scaling step).
X_train_transformed.describe()

Unnamed: 0,12,13,14,15,32,33,34,35,36,37,...,770,771,772,773,774,775,776,777,778,779
count,56000.0,56000.0,56000.0,56000.0,56000.0,56000.0,56000.0,56000.0,56000.0,56000.0,...,56000.0,56000.0,56000.0,56000.0,56000.0,56000.0,56000.0,56000.0,56000.0,56000.0
mean,-6.344131999999999e-19,1.776357e-18,-3.1720659999999997e-19,-3.1720659999999997e-19,-1.332268e-18,-2.537653e-19,2.6645350000000002e-18,-1.966681e-18,-3.172066e-18,6.978545e-18,...,-1.3830210000000002e-17,-4.440892e-18,1.344956e-17,-6.597897e-18,-6.471014e-18,-2.283887e-18,5.075304999999999e-19,5.075305e-18,3.933362e-18,-4.82154e-18
std,1.000009,1.000009,1.000009,1.000009,1.000009,1.000009,1.000009,1.000009,1.000009,1.000009,...,1.000009,1.000009,1.000009,1.000009,1.000009,1.000009,1.000009,1.000009,1.000009,1.000009
min,-0.004573148,-0.005956811,-0.004225809,-0.004225809,-0.004225809,-0.004874994,-0.008045948,-0.01060983,-0.01264212,-0.01747896,...,-0.05746607,-0.05566708,-0.05051134,-0.04149041,-0.03319377,-0.02359472,-0.01712645,-0.01087254,-0.009110351,-0.005972932
25%,-0.004573148,-0.005956811,-0.004225809,-0.004225809,-0.004225809,-0.004874994,-0.008045948,-0.01060983,-0.01264212,-0.01747896,...,-0.05746607,-0.05566708,-0.05051134,-0.04149041,-0.03319377,-0.02359472,-0.01712645,-0.01087254,-0.009110351,-0.005972932
50%,-0.004573148,-0.005956811,-0.004225809,-0.004225809,-0.004225809,-0.004874994,-0.008045948,-0.01060983,-0.01264212,-0.01747896,...,-0.05746607,-0.05566708,-0.05051134,-0.04149041,-0.03319377,-0.02359472,-0.01712645,-0.01087254,-0.009110351,-0.005972932
75%,-0.004573148,-0.005956811,-0.004225809,-0.004225809,-0.004225809,-0.004874994,-0.008045948,-0.01060983,-0.01264212,-0.01747896,...,-0.05746607,-0.05566708,-0.05051134,-0.04149041,-0.03319377,-0.02359472,-0.01712645,-0.01087254,-0.009110351,-0.005972932
max,235.7666,180.27,236.6411,236.6411,236.6411,233.2858,167.2259,129.6408,93.30416,87.89233,...,23.21779,24.12091,26.88485,32.76488,42.48352,58.01052,91.54796,160.7847,146.4153,172.8109


In [15]:
# First model: logistic regression with a very high iteration cap.
clf1 = LogisticRegression(max_iter=100_000)

In [16]:
# Train the logistic regression on the scaled training features.
clf1.fit(X=X_train_transformed, y=y_train)

In [17]:
# Predict labels for the held-out test split with the logistic regression.
y_pred = clf1.predict(X=X_test_transformed)

In [18]:
# Per-class precision / recall / F1 for the logistic regression on the test split.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.95      0.97      0.96      1381
           1       0.95      0.96      0.96      1575
           2       0.92      0.89      0.90      1398
           3       0.89      0.89      0.89      1428
           4       0.93      0.91      0.92      1365
           5       0.87      0.87      0.87      1263
           6       0.94      0.95      0.94      1375
           7       0.92      0.94      0.93      1459
           8       0.89      0.88      0.89      1365
           9       0.90      0.89      0.89      1391

    accuracy                           0.92     14000
   macro avg       0.92      0.92      0.92     14000
weighted avg       0.92      0.92      0.92     14000



In [19]:
# Confusion matrix of true vs. predicted labels for the logistic regression.
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[1333,    0,    5,    2,    6,   11,    9,    2,   12,    1],
       [   1, 1519,    7,   11,    1,    8,    2,   10,   14,    2],
       [  13,   20, 1243,   21,   13,   12,   24,   20,   25,    7],
       [  10,    6,   31, 1268,    4,   53,    3,   17,   25,   11],
       [   2,    7,   14,    4, 1243,    1,   18,    9,   11,   56],
       [  12,    4,   11,   45,    6, 1103,   24,    7,   38,   13],
       [  13,    5,   12,    0,   12,   16, 1310,    0,    7,    0],
       [   5,    8,   15,    9,   11,    2,    0, 1370,    4,   35],
       [  11,   24,   11,   38,    6,   43,    9,    4, 1200,   19],
       [   8,   12,    3,   22,   37,   12,    1,   47,    8, 1241]])

In [20]:
# Second model: random forest with default hyper-parameters.
# random_state is fixed so the forest (bootstrap samples and feature subsets)
# is identical on every Restart & Run All, matching the seed used elsewhere in
# this notebook. n_jobs=-1 builds the trees on all cores without changing the
# fitted model.
clf2 = RandomForestClassifier(random_state=42, n_jobs=-1)

In [21]:
# Train the random forest on the scaled training features.
clf2.fit(X=X_train_transformed, y=y_train)

In [22]:
# Predict labels for the held-out test split with the random forest.
y_pred2 = clf2.predict(X=X_test_transformed)

In [23]:
# Per-class precision / recall / F1 for the random forest on the test split.
print(classification_report(y_true=y_test, y_pred=y_pred2))

              precision    recall  f1-score   support

           0       0.97      0.99      0.98      1381
           1       0.98      0.98      0.98      1575
           2       0.96      0.97      0.97      1398
           3       0.96      0.96      0.96      1428
           4       0.97      0.96      0.97      1365
           5       0.97      0.96      0.97      1263
           6       0.97      0.98      0.98      1375
           7       0.97      0.97      0.97      1459
           8       0.96      0.96      0.96      1365
           9       0.95      0.95      0.95      1391

    accuracy                           0.97     14000
   macro avg       0.97      0.97      0.97     14000
weighted avg       0.97      0.97      0.97     14000



In [24]:
# Confusion matrix of true vs. predicted labels for the random forest.
confusion_matrix(y_true=y_test, y_pred=y_pred2)

array([[1369,    1,    1,    1,    1,    1,    3,    0,    3,    1],
       [   0, 1549,    7,    8,    1,    1,    3,    5,    0,    1],
       [   9,    1, 1351,    4,    5,    0,    5,   12,   11,    0],
       [   2,    1,   13, 1370,    1,   15,    2,   12,    9,    3],
       [   3,    3,    3,    0, 1315,    0,    8,    2,    2,   29],
       [   4,    1,    1,   16,    0, 1212,    9,    2,   10,    8],
       [   9,    2,    2,    0,    2,    4, 1350,    0,    6,    0],
       [   1,    5,   15,    0,   11,    0,    0, 1415,    0,   12],
       [   3,    3,    6,   12,    2,    9,    3,    1, 1308,   18],
       [   6,    7,    1,   18,   18,    2,    2,    9,   11, 1317]])

In [25]:
# Hyper-parameter values to search for the random forest:
# 2 x 3 x 2 = 12 combinations.
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[None, 20, 40],
    min_samples_split=[2, 5],
)

In [26]:
# Seeded base estimator whose hyper-parameters are tuned by the search.
rf = RandomForestClassifier(random_state=42)

# Grid search over `param_grid` with 3-fold cross-validation,
# run in parallel on all available cores.
grid = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
)
grid.fit(X=X_train_transformed, y=y_train)

In [27]:
# Report the winning hyper-parameter combination and its score.
# The score comes from the grid search fitted on the training split (cv=3),
# not from the held-out test split.
print("Best params:", grid.best_params_)
print("Best accuracy:", grid.best_score_)

Best params: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 200}
Best accuracy: 0.9667679052730329


In [28]:
# Keep a handle on the estimator the grid search selected as best.
best_model = grid.best_estimator_

In [29]:
# Predict labels for the held-out test split with the tuned random forest.
y_pred3 = best_model.predict(X=X_test_transformed)

In [30]:
# Per-class precision / recall / F1 for the tuned random forest on the test split.
print(classification_report(y_true=y_test, y_pred=y_pred3))

              precision    recall  f1-score   support

           0       0.97      0.99      0.98      1381
           1       0.98      0.99      0.98      1575
           2       0.96      0.96      0.96      1398
           3       0.96      0.96      0.96      1428
           4       0.97      0.96      0.97      1365
           5       0.97      0.95      0.96      1263
           6       0.97      0.98      0.98      1375
           7       0.97      0.97      0.97      1459
           8       0.96      0.96      0.96      1365
           9       0.94      0.95      0.95      1391

    accuracy                           0.97     14000
   macro avg       0.97      0.97      0.97     14000
weighted avg       0.97      0.97      0.97     14000



In [31]:
# Confusion matrix of true vs. predicted labels for the tuned random forest.
confusion_matrix(y_true=y_test, y_pred=y_pred3)

array([[1371,    1,    0,    0,    1,    0,    3,    0,    4,    1],
       [   0, 1552,    7,    5,    1,    2,    2,    5,    0,    1],
       [  11,    2, 1347,    4,    6,    1,    6,   11,    9,    1],
       [   1,    1,   15, 1370,    0,   15,    2,   13,    8,    3],
       [   5,    3,    2,    0, 1312,    0,    8,    0,    2,   33],
       [   4,    3,    4,   14,    1, 1205,   10,    2,   11,    9],
       [   9,    2,    1,    0,    2,    6, 1353,    0,    2,    0],
       [   2,    5,   14,    0,    8,    0,    0, 1414,    1,   15],
       [   2,    3,    7,   11,    3,   10,    5,    0, 1308,   16],
       [   7,    8,    1,   17,   13,    4,    1,   10,   12, 1318]])