In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.formula.api import ols, logit
from sklearn.linear_model import LinearRegression
import scipy.stats as sp
from statsmodels.discrete.discrete_model import MNLogit

from skimage.io import imread, imshow
import skimage.filters as filter
import skimage.feature as feature
from skimage.measure import label
from tqdm import tqdm
import os
# Excel workbook file names (not referenced by the cells visible in this section).
glow = 'GLOW Data2.xlsx'
lbws = 'LBWSdata.xlsx'

# Image data location: `main` is the root folder; the two sub-folder strings are
# appended to it directly, hence the leading slash. The spelling of the training
# folder must match the directory name on disk, so it is kept exactly as-is.
main = 'Reduced MNIST Data'
training = '/Reduced Trainging data'
testing = '/Reduced Testing data'

In [2]:
def find_corner(img):
    """Return the coordinates of four extreme non-zero pixels of a 2-D image.

    In the returned keys ``X_*`` is a row index (axis 0) and ``Y_*`` is a
    column index (axis 1). The four points are found as follows:

    * ``tl``: first non-empty row, first non-zero column within that row.
    * ``bl``: first non-empty column, first non-zero row within that column.
    * ``tr``: last non-empty row, last non-zero column within that row.
    * ``br``: last non-empty column, last non-zero row within that column.

    Raises ``IndexError`` when the image contains no non-zero pixel.
    """
    # Indices of the rows / columns whose pixel sum is non-zero.
    used_rows = np.where(img.sum(axis=1) != 0)[0]
    used_cols = np.where(img.sum(axis=0) != 0)[0]

    first_row, last_row = used_rows[0], used_rows[-1]
    first_col, last_col = used_cols[0], used_cols[-1]

    # Key order is part of the contract: it becomes the column order downstream.
    return {
        'X_tl': first_row,
        'Y_tl': np.where(img[first_row] != 0)[0][0],
        'X_bl': np.where(img[:, first_col] != 0)[0][0],
        'Y_bl': first_col,
        'X_tr': last_row,
        'Y_tr': np.where(img[last_row] != 0)[0][-1],
        'X_br': np.where(img[:, last_col] != 0)[0][-1],
        'Y_br': last_col,
    }
def segment_sum(img,origin,dist,side):
    """Sum the pixels in a horizontal and a vertical band of an image.

    Both bands use the half-open index range ``[origin - dist, origin + dist)``:
    once along the rows (axis 0) and once along the columns (axis 1).
    A negative start index follows normal NumPy slicing rules (it counts from
    the end), so callers should keep ``origin >= dist``.

    Returns a dict with the keys ``side + '_row_sum'`` and ``side + '_col_sum'``.
    """
    band = slice(origin - dist, origin + dist)

    row_band_total = img[band].sum(axis=1).sum()
    col_band_total = img[:, band].sum(axis=0).sum()

    totals = {}
    totals[side + '_row_sum'] = row_band_total
    totals[side + '_col_sum'] = col_band_total
    return totals
def center_sum(img,originX,originY,dist):
    """Sum the pixels of a square window of an image.

    The window spans rows ``[originX - dist, originX + dist)`` and columns
    ``[originY - dist, originY + dist)``, i.e. it is ``2 * dist`` pixels wide
    in each direction (half-open ranges, normal NumPy slicing rules).
    """
    row_window = slice(originX - dist, originX + dist)
    col_window = slice(originY - dist, originY + dist)
    return img[row_window, col_window].sum()



In [3]:
def img_data(img_og):
    """Build the feature dictionary for a single image.

    Features are computed twice where applicable: on the Canny edge map of the
    image (plain key names) and on the original pixel values (keys carrying
    ``_og``). The insertion order of the keys is significant because callers
    turn the dict into DataFrame columns.

    Parameters
    ----------
    img_og : 2-D array
        The image as loaded from disk. The fixed band/window positions below
        (7, 14, 21) presumably target 28x28 images — TODO confirm for other sizes.

    Returns
    -------
    dict
        Mapping of feature name to scalar value.

    Raises
    ------
    IndexError
        Propagated from ``find_corner`` when the edge map has no set pixel.
    """
    img = feature.canny(img_og)

    outp = {}

    # Global statistics of the edge map (approach taken from the data science lesson).
    outp['max_col'] = img.sum(axis=0).max()
    outp['max_row'] = img.sum(axis=1).max()
    outp['total_pixel'] = img.sum()
    outp['width'] = img.any(axis=0).sum()    # number of columns containing an edge pixel
    outp['height'] = img.any(axis=1).sum()   # number of rows containing an edge pixel
    outp['edges'] = label(img).max()         # highest label assigned to the edge map

    # The same sums on the image before Canny is applied.
    outp['max_col_og'] = img_og.sum(axis=0).max()
    outp['max_row_og'] = img_og.sum(axis=1).max()
    outp['total_pixel_og'] = img_og.sum()

    # Extreme pixels of the edge map. These are computed for the edge map only;
    # the original code re-inserted the very same values a second time, which
    # had no effect and has been dropped.
    outp.update(find_corner(img))

    # (image, key suffix) pairs: edge map first, then the original image.
    sources = ((img, ''), (img_og, '_og'))

    # Row/column bands of half-width 3 around three fixed positions.
    bands = (('top', 7), ('mid', 14), ('bot', 21))
    for source, suffix in sources:
        for side, origin in bands:
            outp.update(segment_sum(source, origin, 3, side + suffix))

    # Square windows of half-width 4 around five fixed (row, column) positions.
    squares = (
        ('center', 14, 14),
        ('top_left', 7, 7),
        ('bot_left', 21, 7),
        ('top_right', 7, 21),
        ('bot_right', 21, 21),
    )
    for source, suffix in sources:
        for name, row, col in squares:
            outp[name + suffix] = center_sum(source, row, col, 4)

    return outp


In [4]:
def create_image_dataframe(train_or_test,from_folder=range(10)):
    """Compute ``img_data`` features for every image in the given class folders.

    Parameters
    ----------
    train_or_test : str
        Sub-folder string appended to ``main`` (e.g. ``training`` or ``testing``).
    from_folder : iterable
        Names of the class folders to read; each is looked up as
        ``main + train_or_test + '/<name>/'``.

    Returns
    -------
    pandas.DataFrame
        One row per image file with the feature columns plus ``true_value``.
        ``true_value`` is the position of the folder within ``from_folder``
        (0, 1, 2, ...), which equals the folder name only for the default
        ``range(10)``.

    Notes
    -----
    Files are processed in the order returned by ``os.listdir``. The per-image
    frames are collected in a list and concatenated once; concatenating inside
    the loop re-copies all earlier rows for every new image.
    """
    row_frames = []
    for i, item in enumerate(from_folder):
        folder = f'/{item}/'
        path = main + train_or_test + folder
        for file in os.listdir(path):
            features = img_data(imread(path + file))
            row_frames.append(pd.DataFrame(features, index=[i]))
        # '\r' lets the next progress message overwrite this one.
        print(f'finnished {train_or_test} {folder}.',end='\r')
    print(f'finnished {train_or_test}.    ')

    # pd.concat rejects an empty list, so fall back to an empty frame.
    imgdata = pd.concat(row_frames) if row_frames else pd.DataFrame()
    return imgdata.reset_index().rename(columns={'index':'true_value'})

In [5]:
def make_1D_image_array(train_or_test,from_folder=range(10),safety=False):
    """Load images and return them flattened, one image per DataFrame column.

    Parameters
    ----------
    train_or_test : str
        Sub-folder string appended to ``main`` (e.g. ``training`` or ``testing``).
    from_folder : iterable
        Names of the class folders to read; each is looked up as
        ``main + train_or_test + '/<name>/'``.
    safety : bool
        Accepted for compatibility; this function does not use it.

    Returns
    -------
    pandas.DataFrame
        784 rows (each image is reshaped to a flat vector of length 784) and
        one column per file, named ``'/<folder>/<file name>'``.

    Notes
    -----
    Files are read in the order returned by ``os.listdir``. The columns are
    collected in a list and concatenated once; concatenating inside the loop
    re-copies every earlier column for each new image.
    """
    column_frames = []
    for item in from_folder:
        folder = f'/{item}/'
        path = main + train_or_test + folder
        for file in os.listdir(path):
            pixels = imread(path + file).reshape(784)
            column_frames.append(pd.DataFrame({folder+file: pixels}))

    # pd.concat rejects an empty list, so fall back to an empty frame.
    if not column_frames:
        return pd.DataFrame()
    return pd.concat(column_frames, axis=1)

In [6]:
# Preview the flattened pixel table for the training images in folder /3/:
# one column per image file, one row per pixel position.
make_1D_image_array(training,from_folder=[3])

Unnamed: 0,/3/5132.jpg,/3/5133.jpg,/3/5134.jpg,/3/5135.jpg,/3/5136.jpg,/3/5137.jpg,/3/5138.jpg,/3/5139.jpg,/3/5140.jpg,/3/5141.jpg,...,/3/6122.jpg,/3/6123.jpg,/3/6124.jpg,/3/6125.jpg,/3/6126.jpg,/3/6127.jpg,/3/6128.jpg,/3/6129.jpg,/3/6130.jpg,/3/6131.jpg
0,0,0,0,0,6,0,5,0,2,0,...,0,0,0,0,0,0,0,0,0,0
1,11,0,19,0,0,0,0,11,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,12,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2,0,0,0,0,0,0,2,0,0,...,0,0,0,0,0,0,0,0,0,0
4,9,0,11,0,2,0,9,9,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
779,0,0,0,0,0,0,0,0,0,0,...,4,0,0,0,0,0,0,0,0,0
780,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
781,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
782,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Exercise 2

In [7]:
# Extract the feature table for every class folder: training set first, then test set.
imgdata_train = create_image_dataframe(training)
imgdata_test = create_image_dataframe(testing)

finnished /Reduced Trainging data.    
finnished /Reduced Testing data.    


In [8]:
# Multinomial logit of true_value on every other column of the training table;
# disp=0 silences the optimiser's printed summary.
model = MNLogit.from_formula(
    'true_value ~ ' + ' + '.join(imgdata_train.columns.drop('true_value')),
    imgdata_train,
).fit(disp=0)



In [9]:
# Model output for every test row; the accuracy cell takes idxmax over axis=1,
# so this is presumably one probability column per class — confirm.
preded=model.predict(imgdata_test)

In [10]:
# Accuracy on the test table: share of rows where the column label with the
# highest predicted value equals the recorded true_value.
a=(preded.idxmax(axis=1)==imgdata_test.true_value).mean()
print(f'{a:.2%} succes rate')

91.95% succes rate


In [11]:
# NOTE(review): pred_table() is called without any data, so this table presumably
# describes the rows the model was fitted on (imgdata_train), not imgdata_test —
# confirm against the statsmodels documentation before reading it as test performance.
model.pred_table()

array([[973.,   0.,   5.,   1.,   1.,   3.,   7.,   1.,   7.,   2.],
       [  0., 981.,   8.,   1.,   1.,   4.,   1.,   3.,   1.,   0.],
       [ 11.,   5., 881.,  41.,   5.,  25.,  16.,   5.,  11.,   0.],
       [  0.,   3.,  41., 884.,   0.,  40.,   6.,   6.,  13.,   7.],
       [  3.,   3.,   2.,   1., 933.,   4.,   6.,   8.,   3.,  37.],
       [  3.,   2.,  38.,  74.,   3., 825.,  14.,  10.,  19.,  12.],
       [  4.,   6.,  15.,   0.,   6.,  11., 950.,   0.,   8.,   0.],
       [  2.,   1.,   4.,   5.,  12.,   1.,   0., 952.,   2.,  21.],
       [  5.,   7.,   9.,  10.,   5.,  14.,   7.,   3., 923.,  17.],
       [  6.,   2.,   2.,   4.,  25.,   9.,   0.,  14.,  14., 924.]])

In [12]:
# Display the training feature table for inspection.
imgdata_train

Unnamed: 0,true_value,max_col,max_row,total_pixel,width,height,edges,max_col_og,max_row_og,total_pixel_og,...,center,top_left,bot_left,top_right,bot_right,center_og,top_left_og,bot_left_og,top_right_og,bot_right_og
0,0,14,8,102,18,21,2,5036,3271,47637,...,15,13,9,7,15,3702,7197,4504,3182,6867
1,0,8,8,86,16,21,2,4786,3062,44903,...,23,1,9,11,8,8850,149,5870,6897,3660
2,0,12,10,109,20,22,2,4803,3800,49680,...,3,17,12,10,16,492,6756,6613,4157,7574
3,0,12,9,105,17,22,2,3811,2795,36262,...,7,5,16,12,19,1219,1155,4454,4972,5413
4,0,10,9,113,21,21,2,4023,4292,53158,...,9,11,14,9,20,1477,5340,7117,4205,9343
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9,8,10,91,16,21,2,2875,3040,29499,...,20,10,4,14,3,8685,1157,1829,3937,684
9996,9,10,11,101,19,22,2,2862,3318,35123,...,19,9,7,13,5,8159,1395,2093,3927,1188
9997,9,9,8,84,14,21,2,3346,3270,34203,...,22,5,4,9,4,8623,830,1061,4170,946
9998,9,8,8,96,16,21,1,3085,3742,34801,...,19,4,9,16,3,10581,854,2987,4432,417
