In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.formula.api import ols, logit
from sklearn.linear_model import LinearRegression
import scipy.stats as sp
from statsmodels.discrete.discrete_model import MNLogit

from skimage.io import imread, imshow
import skimage.filters as filter
import skimage.feature as feature
from skimage.measure import label
from tqdm import tqdm
import os
# Excel workbook file names (not referenced by the image cells below).
glow = 'GLOW Data2.xlsx'
lbws = 'LBWSdata.xlsx'

# Image dataset location: `main` is the root folder; `training` / `testing`
# are sub-folder suffixes that get string-concatenated onto it.
# NOTE: the "Trainging" spelling is part of the path that gets listed with
# os.listdir, so it has to match the directory name on disk.
main = 'Reduced MNIST Data'
training = '/Reduced Trainging data'
testing = '/Reduced Testing data'

In [2]:
def find_corner(img):
    """Locate four extreme non-zero pixels of a 2-D image.

    "X" values are indices along axis 0 (rows) and "Y" values are indices
    along axis 1 (columns). A row/column counts as occupied when its sum is
    non-zero.

    Returned keys, in order:
      X_tl, Y_tl -- first occupied row, and the first non-zero column in it
      X_bl, Y_bl -- first non-zero row in the first occupied column, and
                    that column
      X_tr, Y_tr -- last occupied row, and the last non-zero column in it
      X_br, Y_br -- last non-zero row in the last occupied column, and
                    that column

    Raises IndexError when no row/column has a non-zero sum (e.g. an
    all-zero image), because there is no first/last element to pick.
    """
    # Indices of the rows / columns whose pixel sum is non-zero.
    occupied_rows = np.where(img.sum(axis=1) != 0)[0]
    occupied_cols = np.where(img.sum(axis=0) != 0)[0]

    first_row, last_row = occupied_rows[0], occupied_rows[-1]
    first_col, last_col = occupied_cols[0], occupied_cols[-1]

    def nonzero_positions(line):
        # Positions of the non-zero entries along a single row or column.
        return np.where(line != 0)[0]

    return {
        'X_tl': first_row,
        'Y_tl': nonzero_positions(img[first_row])[0],
        'X_bl': nonzero_positions(img[:, first_col])[0],
        'Y_bl': first_col,
        'X_tr': last_row,
        'Y_tr': nonzero_positions(img[last_row])[-1],
        'X_br': nonzero_positions(img[:, last_col])[-1],
        'Y_br': last_col,
    }
def segment_sum(img,origin,dist,side):
    """Sum the pixels in a horizontal band and in a vertical band of `img`.

    Both bands cover the half-open index range [origin-dist, origin+dist):
    the row band slices axis 0, the column band slices axis 1.

    Returns a dict with two entries whose keys are built from `side`:
    side+'_row_sum' (total of the row band) and side+'_col_sum' (total of
    the column band).
    """
    lo = origin - dist
    hi = origin + dist

    row_band = img[lo:hi]
    col_band = img[:, lo:hi]

    totals = {}
    totals[side + '_row_sum'] = row_band.sum(axis=1).sum()
    totals[side + '_col_sum'] = col_band.sum(axis=0).sum()
    return totals
def center_sum(img,originX,originY,dist):
    """Sum the pixels of a square window of `img`.

    The window spans rows [originX-dist, originX+dist) and columns
    [originY-dist, originY+dist), i.e. it is 2*dist pixels on each side.
    """
    row_window = slice(originX - dist, originX + dist)
    col_window = slice(originY - dist, originY + dist)
    return img[row_window, col_window].sum()



In [3]:
def img_data(img_og):
    """Build a flat dict of hand-crafted features for a single image.

    Features are computed both on the Canny edge map of `img_og` (plain key
    names) and on `img_og` itself (keys carrying an `_og` marker). The key
    insertion order is significant to callers that turn the dict into
    DataFrame columns, so it is kept fixed:

      1. max_col, max_row, total_pixel, width, height, edges   (edge map)
      2. max_col_og, max_row_og, total_pixel_og                (raw image)
      3. the eight X_*/Y_* entries from find_corner            (edge map)
      4. top/mid/bot row and column band sums                  (edge map)
      5. top_og/mid_og/bot_og row and column band sums         (raw image)
      6. center, top_left, bot_left, top_right, bot_right      (edge map)
      7. the same five window sums with an `_og` suffix        (raw image)

    The band origins (7, 14, 21) and window centres are hard-coded;
    presumably they are sized for 28x28 images -- TODO confirm.

    Raises IndexError (from find_corner) if the edge map has no set pixels.
    """
    edge_img = feature.canny(img_og)

    features = {
        # Largest column / row sum and overall sum of the edge map.
        'max_col': max(edge_img.sum(axis=0)),
        'max_row': max(edge_img.sum(axis=1)),
        'total_pixel': edge_img.sum(),
        # Number of columns / rows that contain at least one edge pixel.
        'width': (edge_img == True).any(axis=0).sum(),
        'height': (edge_img == True).any(axis=1).sum(),
        # Highest label assigned by `label` on the edge map.
        'edges': label(edge_img).max(),
        # The same sum statistics taken on the image before Canny.
        'max_col_og': max(img_og.sum(axis=0)),
        'max_row_og': max(img_og.sum(axis=1)),
        'total_pixel_og': img_og.sum(),
    }

    # Extreme edge-pixel coordinates (edge map only).
    features.update(find_corner(edge_img))

    # Each feature family is evaluated on the edge map first, then on the
    # raw image; the suffix distinguishes the two in the key names.
    sources = ((edge_img, ''), (img_og, '_og'))

    # Row/column band sums around three fixed positions, half-width 3.
    bands = (('top', 7), ('mid', 14), ('bot', 21))
    for source, suffix in sources:
        for band_name, origin in bands:
            features.update(segment_sum(source, origin, 3, band_name + suffix))

    # Square window sums (half-width 4) at the centre and the four quadrants,
    # given as (name, row origin, column origin).
    windows = (
        ('center', 14, 14),
        ('top_left', 7, 7),
        ('bot_left', 21, 7),
        ('top_right', 7, 21),
        ('bot_right', 21, 21),
    )
    for source, suffix in sources:
        for window_name, row_origin, col_origin in windows:
            features[window_name + suffix] = center_sum(source, row_origin, col_origin, 4)

    return features


In [4]:
def create_image_dataframe(train_or_test,from_folder=range(10)):
    """Load every image under the selected class folders into a feature table.

    Parameters
    ----------
    train_or_test : str
        Sub-folder suffix appended to the module-level `main` path
        (e.g. the `training` or `testing` constant).
    from_folder : iterable
        Folder names to read; each one is visited as
        main + train_or_test + '/<name>/'.

    Returns
    -------
    pandas.DataFrame
        One row per image file. The first column, `true_value`, is followed
        by the feature columns produced by `img_data`.

    Notes
    -----
    `true_value` is the *position* of the folder inside `from_folder`
    (0, 1, 2, ...), not the folder name. The two coincide for the default
    `range(10)`; for a subset such as `[3, 8]` the labels are 0 and 1.

    Rows are collected in a list and turned into a DataFrame once at the
    end; concatenating onto a growing DataFrame inside the loop copies the
    whole table for every image.
    """
    records = []
    for label_idx, item in enumerate(from_folder):
        folder = f'/{item}/'
        path = main + train_or_test + folder
        for file in os.listdir(path):
            # Put the label first so it ends up as the leading column.
            record = {'true_value': label_idx}
            record.update(img_data(imread(path + file)))
            records.append(record)
        # '\r' keeps the per-folder progress message on a single line.
        print(f'finnished {train_or_test} {folder}.',end='\r')
    print(f'finnished {train_or_test}.    ')
    if not records:
        # No images found: still expose the label column.
        return pd.DataFrame(columns=['true_value'])
    return pd.DataFrame(records)

In [5]:
def make_1D_image_array(train_or_test,from_folder=range(10),safety=False):
    """Load images as flattened pixel columns of a single DataFrame.

    Parameters
    ----------
    train_or_test : str
        Sub-folder suffix appended to the module-level `main` path.
    from_folder : iterable
        Folder names to read; each one is visited as
        main + train_or_test + '/<name>/'.
    safety : bool
        Currently unused; kept so existing callers keep working.

    Returns
    -------
    pandas.DataFrame
        One column per image file, named '/<folder>/<file>', holding the
        image reshaped to 784 values (so every image must contain exactly
        784 pixels, otherwise `reshape` raises ValueError). An empty
        DataFrame is returned when no files are found.

    Notes
    -----
    Columns are gathered in a list and concatenated once; concatenating
    onto a growing DataFrame for every file copies all previous columns
    each time.
    """
    columns = []
    for item in from_folder:
        folder = f'/{item}/'
        path = main + train_or_test + folder
        for file in os.listdir(path):
            pixels = imread(path + file).reshape(784)
            columns.append(pd.Series(pixels, name=folder + file))
    if not columns:
        # pd.concat refuses an empty list, so return the empty frame directly.
        return pd.DataFrame()
    return pd.concat(columns, axis=1)

In [6]:
# Demo: flatten every training image in folder 3 into one pixel column each
# and display the resulting table (rows are pixel positions).
make_1D_image_array(training,from_folder=[3])

Unnamed: 0,/3/5132.jpg,/3/5133.jpg,/3/5134.jpg,/3/5135.jpg,/3/5136.jpg,/3/5137.jpg,/3/5138.jpg,/3/5139.jpg,/3/5140.jpg,/3/5141.jpg,...,/3/6122.jpg,/3/6123.jpg,/3/6124.jpg,/3/6125.jpg,/3/6126.jpg,/3/6127.jpg,/3/6128.jpg,/3/6129.jpg,/3/6130.jpg,/3/6131.jpg
0,0,0,0,0,6,0,5,0,2,0,...,0,0,0,0,0,0,0,0,0,0
1,11,0,19,0,0,0,0,11,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,12,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2,0,0,0,0,0,0,2,0,0,...,0,0,0,0,0,0,0,0,0,0
4,9,0,11,0,2,0,9,9,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
779,0,0,0,0,0,0,0,0,0,0,...,4,0,0,0,0,0,0,0,0,0
780,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
781,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
782,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## exercise 2

In [7]:
# Build the feature tables for the training and the testing image sets.
imgdata_train = create_image_dataframe(training)
imgdata_test = create_image_dataframe(testing)

finnished /Reduced Trainging data.    
finnished /Reduced Testing data.    


In [8]:
# Multinomial logit of the class label on every feature column.
# The formula is assembled as 'true_value ~ col1 + col2 + ...' from all
# columns of the training table except the label itself.
feature_columns = imgdata_train.drop(columns='true_value').columns
formula = 'true_value ~ ' + ' + '.join(feature_columns)
model = MNLogit.from_formula(formula, imgdata_train).fit(disp=0)



In [9]:
# Predicted class probabilities for every row of the test table
# (one column per class, as shown when `preded` is displayed below).
preded=model.predict(imgdata_test)

In [10]:
# For each test row take the column label with the highest predicted
# probability, then report the share of rows where it equals true_value.
predicted_class = preded.idxmax(axis=1)
a = (predicted_class == imgdata_test.true_value).mean()
print(f'{a:.2%} succes rate')

91.95% succes rate


In [13]:
# Display the per-class probability table produced by model.predict.
preded

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,9.999401e-01,6.981965e-17,5.633778e-05,1.047813e-06,4.682702e-11,0.000001,3.932051e-08,7.704059e-08,3.147217e-07,6.519092e-07
1,9.739369e-01,4.627870e-13,2.326036e-02,1.512142e-04,2.967940e-09,0.002269,9.576586e-05,5.547925e-08,2.805401e-04,6.316579e-06
2,9.979947e-01,4.435918e-14,1.044656e-03,1.646770e-05,2.820240e-07,0.000012,3.916854e-06,5.208649e-06,1.856960e-04,7.368019e-04
3,9.982902e-01,4.779501e-13,1.113002e-03,1.717971e-05,1.124789e-07,0.000014,3.274979e-04,1.553822e-05,8.024472e-05,1.421469e-04
4,9.998284e-01,9.616796e-19,9.790424e-05,2.994681e-08,7.269257e-09,0.000002,1.232981e-05,3.055901e-09,5.543506e-05,3.816188e-06
...,...,...,...,...,...,...,...,...,...,...
1995,3.207805e-05,7.694598e-11,1.029876e-07,1.912110e-06,5.210963e-04,0.000052,8.215527e-06,1.143936e-05,6.336835e-03,9.930361e-01
1996,8.369238e-08,9.908723e-08,4.462024e-09,2.344739e-06,5.282310e-01,0.000615,2.562176e-06,8.211727e-04,3.690222e-03,4.666380e-01
1997,5.609871e-07,1.352254e-09,3.511451e-08,3.842039e-05,5.059428e-03,0.000205,4.043722e-06,2.381899e-03,8.143756e-04,9.914964e-01
1998,2.830001e-05,4.064793e-06,5.086260e-06,5.940813e-05,3.054440e-01,0.000582,4.884376e-05,3.823378e-03,3.325583e-02,6.567491e-01


## exercise 3
