In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import fetch_openml
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.linear_model import LogisticRegression

# Download MNIST (784 pixel features + digit label) from https://www.openml.org/d/554
data = fetch_openml('mnist_784', version=1)

# Put pixels and label side by side in one frame; the label goes in the last column, named "target"
pixelsAndLabel = np.column_stack((data["data"], data["target"]))
allColumnNames = data["feature_names"] + ["target"]
dfData = pd.DataFrame(pixelsAndLabel, columns=allColumnNames)

In [6]:
#Making part of our data null
# A freshly seeded generator is created every time this cell runs, so the same
# cells are nulled on each run: results are reproducible and re-running the cell
# does not keep adding more nulls.
nullRng = np.random.RandomState(0)
fivePer = int(0.05*dfData.shape[0])
allInds = np.arange(0,dfData.shape[0],1)
for colPos, col in enumerate(dfData.columns):
    if col == "target":
        continue
    #get at most 5% unique indices and set those values to null
    #(sampling with replacement + np.unique => slightly fewer than 5% per column)
    indsToNull = np.unique(nullRng.choice(allInds,replace=True,size=fivePer))
    # Single positional assignment on the frame itself. The chained form
    # dfData[col].iloc[...] = ... may write to a temporary copy and leave dfData unchanged.
    dfData.iloc[indsToNull, colPos] = np.nan

In [3]:
# With this many rows and columns, practically every row and every feature column
# ends up containing at least one null.
# A sum with skipna=False is NaN exactly when the summed row/column holds a null.
numericData = dfData.astype(float)
rowTotals = numericData.sum(axis=1, skipna=False)
columnTotals = numericData.sum(axis=0, skipna=False)
print("Number of null containing rows:", rowTotals.isnull().sum())
print("Number of null containing columns:", columnTotals.isnull().sum())
print("Data shape:", dfData.shape)

Number of null containing rows: 70000
Number of null containing columns: 784
Data shape: (70000, 785)


In [8]:
# One stratified shuffle split (20% test, fixed seed) over the feature columns and the target
stratSplit = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=0)
featureColumns = data["feature_names"]
# n_splits=1, so the splitter yields exactly one (train, test) pair of positional indices
train_index, test_index = next(stratSplit.split(dfData[featureColumns], dfData["target"]))

# reset_index so every split is labelled 0..n-1 again
X_train = dfData[featureColumns].iloc[train_index].reset_index(drop=True)
y_train = dfData["target"].iloc[train_index].reset_index(drop=True)
X_test = dfData[featureColumns].iloc[test_index].reset_index(drop=True)
y_test = dfData["target"].iloc[test_index].reset_index(drop=True)

# Demonstration: fitting on data that still contains nulls is expected to fail,
# so the error is caught and shown instead of stopping the notebook.
log_reg = LogisticRegression()
print("Trying to train with null values")
try:
    log_reg.fit(X_train, y_train)
except Exception as fitError:
    print(fitError, "Can't train with null values", sep="\n")
# No score is reported here: when the fit above fails there is no trained model to evaluate.

Trying to train with null values
Input contains NaN, infinity or a value too large for dtype('float64').
Can't train with null values


In [None]:
#Option 0: fill in the missing data using subject/domain knowledge, or go back to the
#source to check whether the missing values can still be retrieved

In [5]:
#drop rows with null values
#tolerate at most 40 null values in a row
numColumns = X_train.shape[1]
# thresh is the minimum number of NON-null values a row needs in order to be kept.
# It must be passed without `how`: the two arguments are mutually exclusive.
X_trainDroppedRows = X_train.dropna(axis=0,thresh=numColumns-40)
print("Before:",X_train.shape)
print("After:",X_trainDroppedRows.shape)

tolPercMissing = 0.08
#tolerate at most 8% null values in a row
# thresh is documented as an int; rounding the fractional bound up keeps the
# same rows as comparing the (integer) non-null count against the float.
minNonNullPerRow = int(np.ceil(numColumns-numColumns*tolPercMissing))

X_trainDroppedRows = X_train.dropna(axis=0,thresh=minNonNullPerRow)
X_trainDroppedRows = X_trainDroppedRows.fillna(0)
print("Before:",X_train.shape)
print("After:",X_trainDroppedRows.shape)
# dropna keeps the original index labels, so the matching targets are selected by label (.loc)
remainingIndeciesTrain = X_trainDroppedRows.index
y_trainDroppedRows = y_train.loc[remainingIndeciesTrain]


# Same filtering for the test split; nulls that remain are filled with 0
X_testDroppedRows = X_test.dropna(axis=0,thresh=minNonNullPerRow)
remainingIndeciesTest = X_testDroppedRows.index
y_testDroppedRows = y_test.loc[remainingIndeciesTest]
X_testDroppedRows = X_testDroppedRows.fillna(0)


# NOTE: default solver settings are kept so the accuracy stays comparable with the
# other cells; the captured output shows the solver stops at its iteration limit.
log_reg = LogisticRegression()
log_reg.fit(X_trainDroppedRows,y_trainDroppedRows)
print("Accuracy with dropped rows with null values:",log_reg.score(X_testDroppedRows,y_testDroppedRows))

Before: (56000, 784)
After: (36656, 784)
Before: (56000, 784)
After: (55996, 784)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Accuracy with dropped rows with null values: 0.9145653260947211


In [6]:
#drop columns with null values
numRows = X_train.shape[0]
#tolerate at most 2000 null values in a column
# thresh is the minimum number of NON-null values a column needs in order to be kept.
# It must be passed without `how`: the two arguments are mutually exclusive.
X_trainDroppedColumns = X_train.dropna(axis=1,thresh=numRows-2000)
print("Before:",X_train.shape)
print("After:",X_trainDroppedColumns.shape)

tolPercMissing = 0.049
#tolerate at most 4.9% null values in a column 
#(in this case we know that at most 5% of each column is null)
# thresh is documented as an int; rounding the fractional bound up keeps the
# same columns as comparing the (integer) non-null count against the float.
minNonNullPerColumn = int(np.ceil(numRows-numRows*tolPercMissing))
X_trainDroppedColumns = X_train.dropna(axis=1,thresh=minNonNullPerColumn)
print("Before:",X_train.shape)
print("After:",X_trainDroppedColumns.shape)
# The surviving columns are chosen on the training split only and then reused for the test split
remainingColumns = X_trainDroppedColumns.columns

X_trainDroppedColumns = X_trainDroppedColumns.fillna(0)
X_testDroppedColumns = X_test[remainingColumns].fillna(0)

log_reg = LogisticRegression()
log_reg.fit(X_trainDroppedColumns,y_train)
print("Accuracy with dropped columns with null values:",log_reg.score(X_testDroppedColumns,y_test))

Before: (56000, 784)
After: (56000, 0)
Before: (56000, 784)
After: (56000, 576)
Accuracy with dropped columns with null values: 0.9125


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [7]:
#Simple Imputing
from sklearn.impute import SimpleImputer

# Other strategies that could be passed instead of "mean":
# "median", "most_frequent", or "constant" (together with e.g. fill_value=0.5)
mean_imputer = SimpleImputer(strategy="mean")

# Fit the imputer on the training split only, then apply it to both splits
mean_imputer.fit(X_train)
X_train_mean_imputed = mean_imputer.transform(X_train)
X_test_mean_imputed = mean_imputer.transform(X_test)

log_reg = LogisticRegression().fit(X_train_mean_imputed, y_train)
meanImputedAccuracy = log_reg.score(X_test_mean_imputed, y_test)
print("Accuracy with mean imputed null values:", meanImputedAccuracy)

Accuracy with mean imputed null values: 0.9175


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [None]:
#KNN Imputing
from sklearn.impute import KNNImputer

# A missing value is filled in from the N nearest samples, where "near" is judged on
# the features that are present in both samples.
# N=1 is used because the sample is large and searching for more neighbours would take long.
# Other distance definitions are available; see the KNNImputer documentation.
knn_imputer = KNNImputer(n_neighbors=1)

# Fit the imputer on the training split only, then apply it to both splits
knn_imputer.fit(X_train)
X_train_knn_imputed = knn_imputer.transform(X_train)
X_test_knn_imputed = knn_imputer.transform(X_test)

log_reg = LogisticRegression().fit(X_train_knn_imputed, y_train)
knnImputedAccuracy = log_reg.score(X_test_knn_imputed, y_test)
print("Accuracy with knn imputed null values:", knnImputedAccuracy)

In [None]:
#custom imputing based on knowledge of problem
def getNearestPixels(row,column,maxRow,maxColumn):
    """Return the [row, column] coordinates of the pixels surrounding one pixel.

    Parameters
    ----------
    row, column : int
        Position of the pixel whose neighbours are wanted.
    maxRow, maxColumn : int
        Largest valid row / column index (inclusive).

    Returns
    -------
    list of [int, int]
        Up to eight neighbours, ordered: left column (same row, below, above),
        right column (same row, below, above), then directly above, then
        directly below. Neighbours that would fall outside the grid are skipped.
    """
    hasRowAbove = row > 0
    hasRowBelow = row < maxRow

    # Side columns that exist, left one first
    sideColumns = []
    if column > 0:
        sideColumns.append(column-1)
    if column < maxColumn:
        sideColumns.append(column+1)

    neighbours = []
    for sideColumn in sideColumns:
        # horizontal neighbour, then its diagonal below, then its diagonal above
        neighbours.append([row,sideColumn])
        if hasRowBelow:
            neighbours.append([row+1,sideColumn])
        if hasRowAbove:
            neighbours.append([row-1,sideColumn])

    # vertical neighbours come last
    if hasRowAbove:
        neighbours.append([row-1,column])
    if hasRowBelow:
        neighbours.append([row+1,column])
    return neighbours

def imputeRow(pixels, fill_value=0.0):
    """Fill the null pixels of one flattened square image from their neighbours.

    The flat pixel vector is viewed as a grid with int(sqrt(size)) rows. Null
    pixels are visited in row-major order and each one is replaced by the mean
    of its non-null surrounding pixels. Values imputed earlier in the scan count
    as known when later pixels are processed.

    Parameters
    ----------
    pixels : pandas.Series or array-like
        Flattened image; null entries mark the pixels to impute. The input
        itself is never modified.
    fill_value : float, default 0.0
        Value used when a null pixel has no non-null neighbour at all, so the
        result never contains NaN.

    Returns
    -------
    numpy.ndarray
        1-D float array with the same number of elements as ``pixels`` and no
        nulls left.
    """
    # Work on a float copy: writing into pixels.values could write through to the
    # caller's data, and fails outright when that buffer is read-only.
    flat = np.array(pixels, dtype=float)
    if flat.size == 0:
        return flat

    side = int(np.sqrt(flat.size))
    grid = flat.reshape(side, -1)
    maxRow, maxColumn = grid.shape

    # Only the visited pixel is written, so the set of null pixels can be found up
    # front; argwhere lists them in row-major order, i.e. the nested-loop scan order.
    for row, column in np.argwhere(np.isnan(grid)):
        row, column = int(row), int(column)
        # indices of the surrounding pixels, then their current values
        nearestpixels = getNearestPixels(row, column, maxRow-1, maxColumn-1)
        sur = np.array([grid[r, c] for r, c in nearestpixels], dtype=float)
        if np.isnan(sur).all():
            # No usable neighbour (also covers a 1x1 image): nanmean would warn
            # and return NaN, which would leave a null behind.
            grid[row, column] = fill_value
        else:
            # Setting value of null to mean of the non-null surrounding pixels
            grid[row, column] = np.nanmean(sur)

    return grid.reshape(-1)
                
    
#Impute every image (one row = one image) with the custom function.
#Each call returns an array and sklearn doesn't like the format apply hands back,
#so the results are unpacked into a plain list of arrays.
X_train_custom_impute = list(X_train.apply(imputeRow, axis=1).values)
X_test_custom_impute = list(X_test.apply(imputeRow, axis=1).values)
y_train_custom_impute = y_train.values
y_test_custom_impute = y_test.values

log_reg = LogisticRegression().fit(X_train_custom_impute, y_train_custom_impute)
customImputedAccuracy = log_reg.score(X_test_custom_impute, y_test_custom_impute)
print("Accuracy with custom imputed null values:", customImputedAccuracy)