In [1]:
!pip install python-mnist



In [2]:
%run __init__.py

In [3]:
mndata = MNIST('../python-mnist/data/') #bring mnist training set into this notebook
train_x, train_y = mndata.load_training()
test_x, test_y = mndata.load_testing()

In [4]:
train_x = np.array(train_x)
train_y = np.array(train_y)
test_x = np.array(test_x)
test_y = np.array(test_y)

In [5]:
train_x.shape, len(train_x[0]), train_y.shape, test_x.shape, len(test_x[0]), test_y.shape

((60000, 784), 784, (60000,), (10000, 784), 784, (10000,))

In [6]:
def conf_matrix(pred_y, real_y):
    json_list = [];
    for value in set(real_y): #For each unique prediction.
        TP = sum(pred_y[real_y == value] == value) #When the real is the value, see if prediction is the value. TP
        FP = sum(pred_y[real_y != value] == value) #When the real is not the value, see if the prediction is the value. FP
        TN = sum(pred_y[real_y != value] != value) #When the real is not the value, see if the prediction is not the value. TN
        FN = sum(pred_y[real_y == value] != value) #When the real is the value, see if the prediction is not the value. FN
        a_dict = {'Actual_Value': value,
                  'True Positive': TP,
                  'False Positive': FP,
                  'True Negative': TN,
                  'False Negative': FN,
                  'Sensitivity': TP/(TP+FN),
                  'Specificity': TN/(FP+TN),
                  'Accuracy': (TP+TN)/(TP+FP+FN+TN),
                  'Precision': TP/(TP+FP)
                 }
        json_list.append(a_dict)
    return pd.DataFrame(json_list)

In [28]:
def common_errors(pred_y, real_y):
    mistakes_dict_list = []
    variables = sorted(list(set(real_y)))
    for variable in variables: #for each unique real value
        #Select the erroneous predictions.
        is_variable_mask = real_y == variable
        masked_preds = pred_y[is_variable_mask]
        incorrect_preds_mask = masked_preds != variable
        false_preds = masked_preds[incorrect_preds_mask]
        
        #Make a dict that holds each type of wrong prediction and the number of occurrences.
        #Note: when other_variable == variable, the number of occurrences will be 0.
        mistaken_as_dict = {str(other_variable):\
                            sum(false_preds == other_variable)\
                           for other_variable in variables}
        
        #add the real value to the dictionary to use as an index.
        mistaken_as_dict['real_value'] = variable
        
        mistakes_dict_list.append(mistaken_as_dict)
        #append it to the list of dictionaries
        #return the list as a df
    df = pd.DataFrame(mistakes_dict_list)
    return df.set_index('real_value')

In [8]:
def combo_analysis(pred_y, real_y):
    return pd.concat(conf_matrix(pred_y, real_y), common_errors(pred_y, real_y), axis = 1)

In [9]:
t1 = time()

knn = KNeighborsClassifier(n_jobs = -1)
knn.fit(train_x, train_y)

t2 = time()
print('time to fit: ', (t2-t1))

time to fit:  74.96041941642761


In [10]:
t1 = time()
print('score: ', knn.score(test_x, test_y))

t2 = time()
print('time to score: ', (t2-t1))

score:  0.9688
time to :  374.62475419044495


In [11]:
t1 = time()
pred_y = knn.predict(test_x)
t2 = time()
print('time to predict: ' (t2 - t1))

In [31]:
t1 = time()
display(conf_matrix(pred_y, test_y))
t2 = time()
print('time to do conf_matrix: ', (t2 - t1))


Unnamed: 0,Accuracy,Actual_Value,False Negative,False Positive,Precision,Sensitivity,Specificity,True Negative,True Positive
0,0.9957,0,6,37,0.963403,0.993878,0.995898,8983,974
1,0.9944,1,2,54,0.954507,0.998238,0.993909,8811,1133
2,0.9941,2,41,18,0.982161,0.960271,0.997993,8950,991
3,0.993,3,34,36,0.964427,0.966337,0.995996,8954,976
4,0.9939,4,38,23,0.976215,0.961303,0.99745,8995,944
5,0.9939,5,30,31,0.965286,0.966368,0.996596,9077,862
6,0.9969,6,13,18,0.981308,0.98643,0.998009,9024,945
7,0.992,7,40,40,0.961089,0.961089,0.995542,8932,988
8,0.9928,8,61,11,0.988095,0.937372,0.998781,9015,913
9,0.9909,9,47,44,0.956262,0.953419,0.995106,8947,962


time to do conf_matrix:  0.4135465621948242


In [30]:
t1 = time()
display(common_errors(pred_y, test_y))
t2 = time()
print('time to find common errors: ', (t2 - t1))


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9
real_value,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,0,1,1,0,0,1,2,1,0,0
1,0,0,2,0,0,0,0,0,0,0
2,11,8,0,2,1,0,1,15,3,0
3,0,3,3,0,1,13,1,6,3,4
4,3,7,0,0,0,0,4,2,1,21
5,5,0,0,12,2,0,4,1,2,4
6,5,3,0,0,3,2,0,0,0,0
7,0,22,4,0,3,0,0,0,0,11
8,8,3,5,13,6,12,5,5,0,4
9,5,7,3,9,7,3,1,10,2,0


time to find common errors:  0.03353738784790039


In [32]:
t1 = time()
combo_analysis(pred_y, test_y)
t2 = time()
print('time to do both: ', (t2 - t1))


TypeError: concat() got multiple values for argument 'axis'

In [14]:
len(pred_y), len(test_y)

(10000, 10000)

In [25]:
df = pd.DataFrame()

In [None]:
df.set_index()